This is page 1 of 4. Use http://codebase.md/allenday/solr-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .flake8
├── .gitignore
├── CHANGELOG.md
├── CLAUDE.md
├── CONTRIBUTING.md
├── data
│   ├── bitcoin-whitepaper.json
│   ├── bitcoin-whitepaper.md
│   └── README.md
├── docker-compose.yml
├── LICENSE
├── poetry.lock
├── pyproject.toml
├── QUICKSTART.md
├── README.md
├── scripts
│   ├── check_solr.py
│   ├── create_test_collection.py
│   ├── create_unified_collection.py
│   ├── demo_hybrid_search.py
│   ├── demo_search.py
│   ├── diagnose_search.py
│   ├── direct_mcp_test.py
│   ├── format.py
│   ├── index_documents.py
│   ├── lint.py
│   ├── prepare_data.py
│   ├── process_markdown.py
│   ├── README.md
│   ├── setup.sh
│   ├── simple_index.py
│   ├── simple_mcp_test.py
│   ├── simple_search.py
│   ├── unified_index.py
│   ├── unified_search.py
│   ├── vector_index_simple.py
│   ├── vector_index.py
│   └── vector_search.py
├── solr_config
│   └── unified
│       └── conf
│           ├── schema.xml
│           ├── solrconfig.xml
│           ├── stopwords.txt
│           └── synonyms.txt
├── solr_mcp
│   ├── __init__.py
│   ├── server.py
│   ├── solr
│   │   ├── __init__.py
│   │   ├── client.py
│   │   ├── collections.py
│   │   ├── config.py
│   │   ├── constants.py
│   │   ├── exceptions.py
│   │   ├── interfaces.py
│   │   ├── query
│   │   │   ├── __init__.py
│   │   │   ├── builder.py
│   │   │   ├── executor.py
│   │   │   ├── parser.py
│   │   │   └── validator.py
│   │   ├── response.py
│   │   ├── schema
│   │   │   ├── __init__.py
│   │   │   ├── cache.py
│   │   │   └── fields.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   └── formatting.py
│   │   ├── vector
│   │   │   ├── __init__.py
│   │   │   ├── manager.py
│   │   │   └── results.py
│   │   └── zookeeper.py
│   ├── tools
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── solr_default_vectorizer.py
│   │   ├── solr_list_collections.py
│   │   ├── solr_list_fields.py
│   │   ├── solr_select.py
│   │   ├── solr_semantic_select.py
│   │   ├── solr_vector_select.py
│   │   └── tool_decorator.py
│   ├── utils.py
│   └── vector_provider
│       ├── __init__.py
│       ├── clients
│       │   ├── __init__.py
│       │   └── ollama.py
│       ├── constants.py
│       ├── exceptions.py
│       └── interfaces.py
├── solr.Dockerfile
└── tests
    ├── __init__.py
    ├── integration
    │   ├── __init__.py
    │   └── test_direct_solr.py
    └── unit
        ├── __init__.py
        ├── conftest.py
        ├── fixtures
        │   ├── __init__.py
        │   ├── common.py
        │   ├── config_fixtures.py
        │   ├── http_fixtures.py
        │   ├── server_fixtures.py
        │   ├── solr_fixtures.py
        │   ├── time_fixtures.py
        │   ├── vector_fixtures.py
        │   └── zookeeper_fixtures.py
        ├── solr
        │   ├── schema
        │   │   └── test_fields.py
        │   ├── test_client.py
        │   ├── test_config.py
        │   ├── utils
        │   │   └── test_formatting.py
        │   └── vector
        │       └── test_results.py
        ├── test_cache.py
        ├── test_client.py
        ├── test_config.py
        ├── test_formatting.py
        ├── test_interfaces.py
        ├── test_parser.py
        ├── test_query.py
        ├── test_schema.py
        ├── test_utils.py
        ├── test_validator.py
        ├── test_vector.py
        ├── test_zookeeper.py
        ├── tools
        │   ├── test_base.py
        │   ├── test_init.py
        │   ├── test_solr_default_vectorizer.py
        │   ├── test_solr_list_collections.py
        │   ├── test_solr_list_fields.py
        │   ├── test_tool_decorator.py
        │   └── test_tools.py
        └── vector_provider
            ├── test_constants.py
            ├── test_exceptions.py
            ├── test_interfaces.py
            └── test_ollama.py
```
# Files
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
```
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203
4 | exclude = .venv,.git,__pycache__,build,dist
```
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
```
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | *.bak
  7 | *.un~
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other info into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | cover/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | .pybuilder/
 79 | target/
 80 | 
 81 | # Jupyter Notebook
 82 | .ipynb_checkpoints
 83 | 
 84 | # IPython
 85 | profile_default/
 86 | ipython_config.py
 87 | 
 88 | # pyenv
 89 | #   For a library or package, you might want to ignore these files since the code is
 90 | #   intended to run in multiple environments; otherwise, check them in:
 91 | # .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # UV
101 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
103 | #   commonly ignored for libraries.
104 | #uv.lock
105 | 
106 | # poetry
107 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
109 | #   commonly ignored for libraries.
110 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 | 
113 | # pdm
114 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | #   in version control.
118 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119 | .pdm.toml
120 | .pdm-python
121 | .pdm-build/
122 | 
123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124 | __pypackages__/
125 | 
126 | # Celery stuff
127 | celerybeat-schedule
128 | celerybeat.pid
129 | 
130 | # SageMath parsed files
131 | *.sage.py
132 | 
133 | # Environments
134 | .env
135 | .venv
136 | env/
137 | venv/
138 | ENV/
139 | env.bak/
140 | venv.bak/
141 | 
142 | # Spyder project settings
143 | .spyderproject
144 | .spyproject
145 | 
146 | # Rope project settings
147 | .ropeproject
148 | 
149 | # mkdocs documentation
150 | /site
151 | 
152 | # mypy
153 | .mypy_cache/
154 | .dmypy.json
155 | dmypy.json
156 | 
157 | # Pyre type checker
158 | .pyre/
159 | 
160 | # pytype static type analyzer
161 | .pytype/
162 | 
163 | # Cython debug symbols
164 | cython_debug/
165 | 
166 | # PyCharm
167 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
170 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
171 | #.idea/
172 | 
173 | # Ruff stuff:
174 | .ruff_cache/
175 | 
176 | # PyPI configuration file
177 | .pypirc
178 | 
```
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
```markdown
 1 | # Utility Scripts for Solr MCP
 2 | 
 3 | This directory contains utility scripts for working with the Solr MCP server.
 4 | 
 5 | ## Scripts
 6 | 
 7 | ### demo_search.py
 8 | 
 9 | Demonstrates how to use the MCP client to search for information using both text search and vector search.
10 | 
11 | **Usage:**
12 | 
13 | ```bash
14 | # Text search
15 | python demo_search.py "bitcoin mining" --collection vectors
16 | 
17 | # Vector (semantic) search
18 | python demo_search.py "How does Bitcoin prevent double-spending?" --vector --collection vectors
19 | 
20 | # Specify number of results
21 | python demo_search.py "blockchain" --results 10
22 | ```
23 | 
24 | The script shows how to connect to the MCP server, perform different types of searches, and display the results.
25 | 
26 | ### process_markdown.py
27 | 
28 | Splits markdown files into sections based on headings and converts them to JSON documents ready for Solr indexing.
29 | 
30 | **Usage:**
31 | 
32 | ```bash
33 | # Process a markdown file and output to stdout
34 | python process_markdown.py data/document.md
35 | 
36 | # Process a markdown file and save to a JSON file
37 | python process_markdown.py data/document.md --output data/processed/document_sections.json
38 | ```
39 | 
40 | The script supports markdown files with YAML frontmatter. The frontmatter metadata will be added to each section document.
41 | 
42 | ### index_documents.py
43 | 
44 | Indexes documents from a JSON file into Solr with vector embeddings generated using Ollama's nomic-embed-text model.
45 | 
46 | **Usage:**
47 | 
48 | ```bash
49 | # Index documents into the default collection
50 | python index_documents.py data/processed/document_sections.json
51 | 
52 | # Index documents into a specific collection
53 | python index_documents.py data/processed/document_sections.json --collection my_collection
54 | 
55 | # Index documents without committing (useful for batch indexing)
56 | python index_documents.py data/processed/document_sections.json --no-commit
57 | ```
58 | 
59 | ## Workflow Example
60 | 
61 | 1. Process a markdown file:
62 | 
63 | ```bash
64 | python process_markdown.py data/document.md --output data/processed/document_sections.json
65 | ```
66 | 
67 | 2. Start the Docker containers (if not already running):
68 | 
69 | ```bash
70 | docker-compose up -d
71 | ```
72 | 
73 | 3. Index the documents with vector embeddings:
74 | 
75 | ```bash
76 | python index_documents.py data/processed/document_sections.json --collection vectors
77 | ```
78 | 
79 | 4. Use the MCP server to search the documents:
80 | 
81 | ```bash
82 | # Configure Claude Desktop to use the MCP server
83 | # Then ask questions about the document
84 | ```
```
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
```markdown
 1 | # Data Examples for Solr MCP
 2 | 
 3 | This directory contains example data for testing and demonstrating the Solr MCP server.
 4 | 
 5 | ## Bitcoin Whitepaper Example
 6 | 
 7 | The Bitcoin whitepaper by Satoshi Nakamoto is included as an example document for testing semantic search capabilities.
 8 | 
 9 | ### Files
10 | 
11 | - `bitcoin-whitepaper.md`: The original Bitcoin whitepaper in markdown format
12 | - `processed/bitcoin_sections.json`: The whitepaper split into sections, ready for indexing
13 | - `processed/bitcoin_metadata.md`: Example with proper YAML frontmatter metadata
14 | - `processed/bitcoin_metadata.json`: Processed version with metadata included
15 | 
16 | ### Using the Bitcoin Whitepaper Example
17 | 
18 | 1. **Process the whitepaper into sections** (already done):
19 | 
20 | ```bash
21 | python scripts/process_markdown.py data/bitcoin-whitepaper.md --output data/processed/bitcoin_sections.json
22 | ```
23 | 
24 | 2. **Start the Docker containers**:
25 | 
26 | ```bash
27 | docker-compose up -d
28 | ```
29 | 
30 | 3. **Index the sections with vector embeddings**:
31 | 
32 | ```bash
33 | python scripts/index_documents.py data/processed/bitcoin_sections.json --collection vectors
34 | ```
35 | 
36 | 4. **Search using Claude Desktop**:
37 | 
38 | Configure Claude Desktop to use your MCP server, then ask questions like:
39 | 
40 | - "How does Bitcoin solve the double-spending problem?"
41 | - "Explain Bitcoin's proof-of-work system"
42 | - "What is the incentive for nodes to support the network?"
43 | 
44 | The MCP server will find the most semantically relevant sections from the whitepaper and return them to Claude.
45 | 
46 | ## Adding Your Own Documents
47 | 
48 | You can add your own documents to this directory and process them using the same workflow:
49 | 
50 | 1. Add markdown documents to the `data/` directory
51 | 2. Process them into sections:
52 | 
53 | ```bash
54 | python scripts/process_markdown.py data/your-document.md --output data/processed/your-document_sections.json
55 | ```
56 | 
57 | 3. Index them into Solr:
58 | 
59 | ```bash
60 | python scripts/index_documents.py data/processed/your-document_sections.json --collection vectors
61 | ```
62 | 
63 | ### YAML Frontmatter
64 | 
65 | For better document organization, add YAML frontmatter to your markdown files:
66 | 
67 | ```markdown
68 | ---
69 | title: "Document Title"
70 | author: "Author Name"
71 | date: "2023-01-01"
72 | tags:
73 |   - tag1
74 |   - tag2
75 | categories:
76 |   - category1
77 |   - category2
78 | ---
79 | 
80 | # Your Document Content
81 | ...
82 | ```
83 | 
84 | This metadata will be included in the indexed documents and can be used for filtering searches.
```
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
```markdown
 1 | # Solr MCP
 2 | 
 3 | A Python package for accessing Apache Solr indexes via Model Context Protocol (MCP). This integration allows AI assistants like Claude to perform powerful search queries against your Solr indexes, combining both keyword and vector search capabilities.
 4 | 
 5 | ## Features
 6 | 
 7 | - **MCP Server**: Implements the Model Context Protocol for integration with AI assistants
 8 | - **Hybrid Search**: Combines keyword search precision with vector search semantic understanding
 9 | - **Vector Embeddings**: Generates embeddings for documents using Ollama with nomic-embed-text
10 | - **Unified Collections**: Store both document content and vector embeddings in the same collection
11 | - **Docker Integration**: Easy setup with Docker and docker-compose
12 | - **Optimized Vector Search**: Efficiently handles combined vector and SQL queries by pushing down SQL filters to the vector search stage, ensuring optimal performance even with large result sets and pagination
13 | 
14 | ## Architecture
15 | 
16 | ### Vector Search Optimization
17 | 
18 | The system employs an important optimization for combined vector and SQL queries. When executing a query that includes both vector similarity search and SQL filters:
19 | 
20 | 1. SQL filters (WHERE clauses) are pushed down to the vector search stage
21 | 2. This ensures that vector similarity calculations are only performed on documents that will match the final SQL criteria
22 | 3. Significantly improves performance for queries with:
23 |    - Selective WHERE clauses
24 |    - Pagination (LIMIT/OFFSET)
25 |    - Large result sets
26 | 
27 | This optimization reduces computational overhead and network transfer by minimizing the number of vector similarity calculations needed.
28 | 
29 | ## Quick Start
30 | 
31 | 1. Clone this repository
32 | 2. Start SolrCloud with Docker:
33 |    ```bash
34 |    docker-compose up -d
35 |    ```
36 | 3. Install dependencies:
37 |    ```bash
38 |    python -m venv venv
39 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
40 |    pip install poetry
41 |    poetry install
42 |    ```
43 | 4. Process and index the sample document:
44 |    ```bash
45 |    python scripts/process_markdown.py data/bitcoin-whitepaper.md --output data/processed/bitcoin_sections.json
46 |    python scripts/create_unified_collection.py unified
47 |    python scripts/unified_index.py data/processed/bitcoin_sections.json --collection unified
48 |    ```
49 | 5. Run the MCP server:
50 |    ```bash
51 |    poetry run python -m solr_mcp.server
52 |    ```
53 | 
54 | For more detailed setup and usage instructions, see the [QUICKSTART.md](QUICKSTART.md) guide.
55 | 
56 | ## Requirements
57 | 
58 | - Python 3.10 or higher
59 | - Docker and Docker Compose
60 | - SolrCloud 9.x
61 | - Ollama (for embedding generation)
62 | 
63 | ## License
64 | 
65 | This project is licensed under the MIT License - see the LICENSE file for details.
66 | 
67 | ## Contributing
68 | 
69 | Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
```
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
```markdown
 1 | # Contributing to Solr MCP
 2 | 
 3 | Thank you for your interest in contributing to the Solr MCP project! This document provides guidelines and instructions for contributing.
 4 | 
 5 | ## Getting Started
 6 | 
 7 | 1. Fork the repository on GitHub
 8 | 2. Clone your fork locally
 9 | 3. Set up the development environment as described in the README
10 | 4. Create a new branch for your changes
11 | 
12 | ## Development Workflow
13 | 
14 | 1. Make your changes in your branch
15 | 2. Write or update tests for your changes
16 | 3. Ensure all tests pass
17 | 4. Format your code using Black and isort
18 | 5. Submit a pull request
19 | 
20 | ## Code Style Guidelines
21 | 
22 | - Follow PEP 8 style guide with 88-char line length (Black formatter)
23 | - Use type hints consistently (Python 3.9+ typing)
24 | - Group imports: stdlib → third-party → local
25 | - Document functions, classes, and tools with docstrings
26 | 
27 | ## Testing
28 | 
29 | Run the test suite with:
30 | 
31 | ```bash
32 | poetry run pytest
33 | ```
34 | 
35 | For test coverage:
36 | 
37 | ```bash
38 | poetry run pytest --cov=solr_mcp
39 | ```
40 | 
41 | ## Submitting Pull Requests
42 | 
43 | 1. Update the README.md with details of changes if appropriate
44 | 2. Update the CHANGELOG.md following the Keep a Changelog format
45 | 3. The version will be updated according to Semantic Versioning by the maintainers
46 | 4. Once you have the sign-off of a maintainer, your PR will be merged
47 | 
48 | ## License
49 | 
50 | By contributing to this project, you agree that your contributions will be licensed under the project's MIT License.
```
--------------------------------------------------------------------------------
/CLAUDE.md:
--------------------------------------------------------------------------------
```markdown
 1 | # CLAUDE.md - Solr MCP Server Guide (Python)
 2 | 
 3 | ## IMPORTANT NOTE
 4 | Before using the search tools, make sure the Bitcoin whitepaper content is properly indexed in the unified collection!
 5 | If search queries like "double spend" return no results, you may need to reindex the content:
 6 | 
 7 | ```bash
 8 | python scripts/process_markdown.py data/bitcoin-whitepaper.md --output data/processed/bitcoin_sections.json
 9 | python scripts/unified_index.py data/processed/bitcoin_sections.json --collection unified
10 | ```
11 | 
12 | ## Project Structure
13 | - Python-based MCP server integrating with SolrCloud
14 | - Uses MCP 1.4.1 framework for protocol implementation
15 | - Provides document search and knowledge retrieval for AI systems
16 | - Supports SolrCloud collections and distributed search
17 | - Vector search/KNN capabilities for semantic search
18 | 
19 | ## Environment Setup
20 | - Python 3.10: `python3.10 -m venv venv`
21 | - Activate: `source venv/bin/activate` (Unix) or `venv\Scripts\activate` (Windows)
22 | - Install Poetry: `pip install poetry`
23 | 
24 | ## Build Commands
25 | - Install all deps: `poetry install`
26 | - Run server: `poetry run python -m solr_mcp.server`
27 | - Debug mode: `poetry run python -m solr_mcp.server --debug`
28 | - Package: `poetry build`
29 | 
30 | ## Test Commands
31 | - Run tests: `poetry run pytest`
32 | - Single test: `poetry run pytest tests/test_file.py::test_function`
33 | - Coverage: `poetry run pytest --cov=solr_mcp`
34 | - Lint: `poetry run flake8 solr_mcp tests`
35 | - Type check: `poetry run mypy solr_mcp tests`
36 | - Format code: `poetry run black solr_mcp tests`
37 | - Sort imports: `poetry run isort solr_mcp tests`
38 | - Run all formatting: `poetry run black solr_mcp tests && poetry run isort solr_mcp tests`
39 | 
40 | ## Docker Commands
41 | - Start SolrCloud: `docker-compose up -d`
42 | - Check logs: `docker-compose logs -f`
43 | - Solr UI: http://localhost:8983/solr/
44 | - Stop SolrCloud: `docker-compose down`
45 | - Cleanup volumes: `docker-compose down -v`
46 | 
47 | ## SolrCloud Integration
48 | - Connection via pysolr with ZooKeeper ensemble
49 | - Support for collection management and configuration
50 | - Handle distributed search with configurable shards and replicas
51 | - Vector search using dense_vector fields and KNN
52 | - Hybrid search combining keyword and vector search capabilities
53 | - Embedding generation via Ollama using nomic-embed-text (768D vectors)
54 | - Unified collections storing both text content and vector embeddings
55 | - Implement retry and fallback logic for resilience
56 | 
57 | ## Code Style Guidelines
58 | - Follow PEP 8 style guide with 88-char line length (Black formatter)
59 | - Use type hints consistently (Python 3.9+ typing)
60 | - Group imports: stdlib → third-party → local
 61 | - Document functions, classes, and tools with docstrings
62 | - Handle Solr connection errors with appropriate retries
63 | - Log operations with structured logging (JSON format)
64 | - Return well-formatted errors following JSON-RPC 2.0 spec
65 | 
66 | ## Technical Details
67 | 
68 | Key implementation details:
69 | 
70 | - Uses MCP 1.4.1 framework for protocol implementation
```
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Tests package."""
2 | 
```
--------------------------------------------------------------------------------
/tests/unit/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Unit tests package."""
2 | 
```
--------------------------------------------------------------------------------
/tests/integration/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Integration tests package."""
2 | 
```
--------------------------------------------------------------------------------
/solr_mcp/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Solr MCP Server - Model Context Protocol server for SolrCloud integration."""
2 | 
3 | __version__ = "0.1.0"
4 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Fixture package for unit tests.
2 | 
3 | This package contains various fixtures categorized by functionality.
4 | """
5 | 
```
--------------------------------------------------------------------------------
/solr_mcp/vector_provider/clients/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Vector provider client implementations."""
2 | 
3 | from .ollama import OllamaVectorProvider
4 | 
5 | __all__ = ["OllamaVectorProvider"]
6 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/query/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Query building and validation package for SolrCloud client."""
2 | 
3 | from solr_mcp.solr.query.builder import QueryBuilder
4 | 
5 | __all__ = ["QueryBuilder"]
6 | 
```
--------------------------------------------------------------------------------
/solr_mcp/vector_provider/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Vector provider implementations."""
2 | 
3 | from solr_mcp.vector_provider.clients.ollama import OllamaVectorProvider
4 | 
5 | __all__ = ["OllamaVectorProvider"]
6 | 
```
--------------------------------------------------------------------------------
/solr_config/unified/conf/stopwords.txt:
--------------------------------------------------------------------------------
```
 1 | # Standard stop words
 2 | a
 3 | an
 4 | and
 5 | are
 6 | as
 7 | at
 8 | be
 9 | but
10 | by
11 | for
12 | if
13 | in
14 | into
15 | is
16 | it
17 | no
18 | not
19 | of
20 | on
21 | or
22 | such
23 | that
24 | the
25 | their
26 | then
27 | there
28 | these
29 | they
30 | this
31 | to
32 | was
33 | will
34 | with
```
--------------------------------------------------------------------------------
/solr_mcp/solr/schema/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Schema management package for SolrCloud client."""
2 | 
3 | from solr_mcp.solr.schema.cache import FieldCache
4 | from solr_mcp.solr.schema.fields import FieldManager
5 | 
6 | __all__ = ["FieldManager", "FieldCache"]
7 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/utils/__init__.py:
--------------------------------------------------------------------------------
```python
 1 | """Solr utilities package."""
 2 | 
 3 | from solr_mcp.solr.utils.formatting import (
 4 |     format_error_response,
 5 |     format_search_results,
 6 |     format_sql_response,
 7 | )
 8 | 
 9 | __all__ = ["format_search_results", "format_sql_response", "format_error_response"]
10 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/vector/__init__.py:
--------------------------------------------------------------------------------
```python
1 | """Vector search functionality."""
2 | 
3 | from solr_mcp.solr.vector.manager import VectorManager
4 | from solr_mcp.solr.vector.results import VectorSearchResult, VectorSearchResults
5 | 
6 | __all__ = ["VectorManager", "VectorSearchResult", "VectorSearchResults"]
7 | 
```
--------------------------------------------------------------------------------
/solr_config/unified/conf/synonyms.txt:
--------------------------------------------------------------------------------
```
 1 | # Synonym mappings
 2 | bitcoin, btc
 3 | blockchain, distributed ledger
 4 | cryptocurrency, crypto
 5 | double spend, double spending, double-spend, double-spending, doublespend, doublespending
 6 | consensus, agreement
 7 | transaction, tx
 8 | block, blocks
 9 | mining, miner, miners
10 | peer to peer, peer-to-peer, p2p
11 | cryptographic, cryptography, crypto
12 | distributed ledger, blockchain
13 | proof of work, pow
14 | hash, hashing
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_list_collections.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for listing Solr collections."""
 2 | 
 3 | from typing import Dict, List
 4 | 
 5 | from solr_mcp.tools.tool_decorator import tool
 6 | 
 7 | 
 8 | @tool()
 9 | async def execute_list_collections(mcp) -> List[str]:
10 |     """List all available Solr collections.
11 | 
12 |     Lists all collections available in the Solr cluster.
13 | 
14 |     Args:
15 |         mcp: SolrMCPServer instance
16 | 
17 |     Returns:
18 |         List of collection names
19 |     """
20 |     solr_client = mcp.solr_client
21 |     return await solr_client.list_collections()
22 | 
```
--------------------------------------------------------------------------------
/solr_mcp/vector_provider/exceptions.py:
--------------------------------------------------------------------------------
```python
 1 | """Exceptions for vector provider module."""
 2 | 
 3 | 
 4 | class VectorError(Exception):
 5 |     """Base exception for vector-related errors."""
 6 | 
 7 |     pass
 8 | 
 9 | 
10 | class VectorGenerationError(VectorError):
11 |     """Raised when vector generation fails."""
12 | 
13 |     pass
14 | 
15 | 
16 | class VectorConfigError(VectorError):
17 |     """Raised when there is an error in vector provider configuration."""
18 | 
19 |     pass
20 | 
21 | 
22 | class VectorConnectionError(VectorError):
23 |     """Raised when connection to vector service fails."""
24 | 
25 |     pass
26 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/__init__.py:
--------------------------------------------------------------------------------
```python
 1 | """SolrCloud client package."""
 2 | 
 3 | from solr_mcp.solr.client import SolrClient
 4 | from solr_mcp.solr.config import SolrConfig
 5 | from solr_mcp.solr.constants import FIELD_TYPE_MAPPING, SYNTHETIC_SORT_FIELDS
 6 | from solr_mcp.solr.exceptions import (
 7 |     ConfigurationError,
 8 |     ConnectionError,
 9 |     QueryError,
10 |     SchemaError,
11 |     SolrError,
12 | )
13 | 
14 | __all__ = [
15 |     "SolrConfig",
16 |     "SolrClient",
17 |     "SolrError",
18 |     "ConfigurationError",
19 |     "ConnectionError",
20 |     "QueryError",
21 |     "SchemaError",
22 |     "FIELD_TYPE_MAPPING",
23 |     "SYNTHETIC_SORT_FIELDS",
24 | ]
25 | 
```
--------------------------------------------------------------------------------
/solr_mcp/vector_provider/constants.py:
--------------------------------------------------------------------------------
```python
 1 | """Constants for vector module."""
 2 | 
 3 | from typing import Any, Dict
 4 | 
 5 | # Default configuration for vector providers
 6 | DEFAULT_OLLAMA_CONFIG: Dict[str, Any] = {
 7 |     "base_url": "http://localhost:11434",
 8 |     "model": "nomic-embed-text",
 9 |     "timeout": 30,  # seconds
10 |     "retries": 3,
11 | }
12 | 
13 | # Environment variable names
14 | ENV_OLLAMA_BASE_URL = "OLLAMA_BASE_URL"
15 | ENV_OLLAMA_MODEL = "OLLAMA_MODEL"
16 | 
17 | # HTTP endpoints
18 | OLLAMA_EMBEDDINGS_PATH = "/api/embeddings"
19 | 
20 | # Model-specific constants
21 | MODEL_DIMENSIONS = {"nomic-embed-text": 768}  # 768-dimensional vectors
22 | 
```
--------------------------------------------------------------------------------
/tests/unit/conftest.py:
--------------------------------------------------------------------------------
```python
 1 | """Test configuration and fixtures.
 2 | 
 3 | This module imports and re-exports all fixtures from the fixtures directory,
 4 | making them available to all tests without explicit imports.
 5 | """
 6 | 
 7 | # Import and expose all fixtures
 8 | from unittest.mock import mock_open
 9 | 
10 | from .fixtures.common import *
11 | from .fixtures.config_fixtures import *
12 | from .fixtures.http_fixtures import *
13 | from .fixtures.server_fixtures import *
14 | from .fixtures.solr_fixtures import *
15 | from .fixtures.time_fixtures import *
16 | from .fixtures.vector_fixtures import *
17 | from .fixtures.zookeeper_fixtures import *
18 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/time_fixtures.py:
--------------------------------------------------------------------------------
```python
 1 | """Time-related fixtures for unit tests."""
 2 | 
 3 | import time
 4 | from unittest.mock import patch
 5 | 
 6 | import pytest
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def mock_time(request):
11 |     """Parameterized time.time mock.
12 | 
13 |     Args:
14 |         request: Pytest request object that can contain parameters:
15 |             - minutes_offset: Minutes to add to current time
16 |     """
17 |     # Get parameters or use defaults
18 |     minutes_offset = getattr(request, "param", {}).get("minutes_offset", 0)
19 |     seconds_offset = minutes_offset * 60
20 | 
21 |     with patch("time.time", return_value=time.time() + seconds_offset) as mock:
22 |         yield mock
23 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/constants.py:
--------------------------------------------------------------------------------
```python
 1 | """Constants for SolrCloud client."""
 2 | 
# Maps Solr field types to the broad sort categories used for validation.
FIELD_TYPE_MAPPING = {
    "string": "string",
    "text_general": "text",
    "text_en": "text",
    "int": "numeric",
    "long": "numeric",
    "float": "numeric",
    "double": "numeric",
    "date": "date",
    "boolean": "boolean",
}

# Synthetic fields usable for sorting even though they are not declared in a
# collection's schema.
SYNTHETIC_SORT_FIELDS = {
    # Per-query relevance score; sortable and also usable in queries.
    "score": {
        "type": "numeric",
        "directions": ["asc", "desc"],
        "default_direction": "desc",
        "searchable": True,
    },
    # Internal Lucene document id; sortable only, never searchable.
    "_docid_": {
        "type": "numeric",
        "directions": ["asc", "desc"],
        "default_direction": "asc",
        "searchable": False,
        "warning": "Internal Lucene document ID. Not stable across restarts or reindexing.",
    },
}
32 | 
```
--------------------------------------------------------------------------------
/solr_mcp/tools/__init__.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool definitions for Solr MCP server."""
 2 | 
 3 | import inspect
 4 | import sys
 5 | 
 6 | from .solr_default_vectorizer import get_default_text_vectorizer
 7 | from .solr_list_collections import execute_list_collections
 8 | from .solr_list_fields import execute_list_fields
 9 | from .solr_select import execute_select_query
10 | from .solr_semantic_select import execute_semantic_select_query
11 | from .solr_vector_select import execute_vector_select_query
12 | from .tool_decorator import get_schema, tool
13 | 
# Public tool entry points exported by this package.
__all__ = [
    "execute_list_collections",
    "execute_list_fields",
    "execute_select_query",
    "execute_vector_select_query",
    "execute_semantic_select_query",
    "get_default_text_vectorizer",
]

# Collect every module-level function that the @tool decorator has marked.
TOOLS_DEFINITION = [
    member
    for _, member in inspect.getmembers(sys.modules[__name__], inspect.isfunction)
    if getattr(member, "_is_tool", False)
]
28 | 
```
--------------------------------------------------------------------------------
/scripts/format.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Format script to run all code formatters on the project.
 4 | """
 5 | 
 6 | import subprocess
 7 | import sys
 8 | from typing import List
 9 | 
10 | 
def run_command(command: List[str]) -> bool:
    """Run a command and return True if successful, False otherwise.

    Args:
        command: Command and arguments to execute.

    Returns:
        True when the command exits with status 0; False on a non-zero exit
        status or when the executable cannot be found.
    """
    print(f"Running: {' '.join(command)}")
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # Previously an uncaught FileNotFoundError crashed the script when a
        # formatter (e.g. black/isort) was not installed; report it instead.
        print(f"Command not found: {command[0]}")
        return False

    if result.returncode != 0:
        print(f"Command failed with exit code {result.returncode}")
        print(result.stdout)
        print(result.stderr)
        return False

    print(result.stdout)
    return True
24 | 
25 | 
def main() -> int:
    """Run black and isort over the project.

    Returns:
        0 when every formatter succeeded, 1 otherwise.
    """
    print("Running code formatters...")

    commands = [
        ["black", "solr_mcp", "tests"],
        ["isort", "solr_mcp", "tests"],
    ]
    # Run every formatter even if an earlier one fails.
    results = [run_command(cmd) for cmd in commands]

    if all(results):
        print("All formatting completed successfully!")
        return 0
    print("Some formatting commands failed.")
    return 1
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     sys.exit(main())
```
--------------------------------------------------------------------------------
/scripts/lint.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Lint script to run all linting tools on the project.
 4 | """
 5 | 
 6 | import subprocess
 7 | import sys
 8 | from typing import List
 9 | 
10 | 
def run_command(command: List[str]) -> bool:
    """Run a command and return True if successful, False otherwise.

    Args:
        command: Command and arguments to execute.

    Returns:
        True when the command exits with status 0; False on a non-zero exit
        status or when the executable cannot be found.
    """
    print(f"Running: {' '.join(command)}")
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError:
        # Previously an uncaught FileNotFoundError crashed the script when a
        # linter (e.g. flake8/mypy) was not installed; report it instead.
        print(f"Command not found: {command[0]}")
        return False

    if result.returncode != 0:
        print(f"Command failed with exit code {result.returncode}")
        print(result.stdout)
        print(result.stderr)
        return False

    print(result.stdout)
    return True
24 | 
25 | 
def main() -> int:
    """Run flake8 and mypy over the project.

    Returns:
        0 when every check passed, 1 otherwise.
    """
    print("Running full linting checks...")

    commands = [
        ["flake8", "solr_mcp", "tests"],
        ["mypy", "solr_mcp", "tests"],
    ]
    # Run every checker even if an earlier one fails.
    results = [run_command(cmd) for cmd in commands]

    if all(results):
        print("All linting checks passed!")
        return 0
    print("Some linting checks failed.")
    return 1
46 | 
47 | 
48 | if __name__ == "__main__":
49 |     sys.exit(main())
```
--------------------------------------------------------------------------------
/solr_mcp/tools/base.py:
--------------------------------------------------------------------------------
```python
 1 | """Base tool definitions and decorators."""
 2 | 
 3 | from functools import wraps
 4 | from typing import Any, Callable, Dict, List, Optional, Union
 5 | 
 6 | 
 7 | def tool(
 8 |     name: Optional[str] = None,
 9 |     description: Optional[str] = None,
10 |     parameters: Optional[Dict[str, Any]] = None,
11 | ) -> Callable:
12 |     """Decorator to mark a function as an MCP tool.
13 | 
14 |     Args:
15 |         name: Tool name. Defaults to function name if not provided.
16 |         description: Tool description. Defaults to function docstring if not provided.
17 |         parameters: Tool parameters. Defaults to function parameters if not provided.
18 |     """
19 | 
20 |     def decorator(func: Callable) -> Callable:
21 |         @wraps(func)
22 |         def wrapper(*args, **kwargs) -> List[Dict[str, str]]:
23 |             result = func(*args, **kwargs)
24 |             if not isinstance(result, list):
25 |                 result = [{"type": "text", "text": str(result)}]
26 |             return result
27 | 
28 |         # Mark as tool
29 |         wrapper._is_tool = True
30 | 
31 |         # Set tool metadata
32 |         wrapper._tool_name = name or func.__name__
33 |         wrapper._tool_description = description or func.__doc__ or ""
34 |         wrapper._tool_parameters = parameters or {}
35 | 
36 |         return wrapper
37 | 
38 |     return decorator
39 | 
```
--------------------------------------------------------------------------------
/tests/unit/tools/test_init.py:
--------------------------------------------------------------------------------
```python
 1 | """Test tools initialization."""
 2 | 
 3 | import pytest
 4 | 
 5 | from solr_mcp.tools import (
 6 |     TOOLS_DEFINITION,
 7 |     execute_list_collections,
 8 |     execute_list_fields,
 9 |     execute_select_query,
10 |     execute_semantic_select_query,
11 |     execute_vector_select_query,
12 |     get_default_text_vectorizer,
13 | )
14 | 
15 | 
def test_tools_definition():
    """TOOLS_DEFINITION should contain exactly the expected tool functions."""
    expected_tools = {
        "solr_list_collections": execute_list_collections,
        "solr_list_fields": execute_list_fields,
        "solr_select": execute_select_query,
        "solr_vector_select": execute_vector_select_query,
        "solr_semantic_select": execute_semantic_select_query,
        "get_default_text_vectorizer": get_default_text_vectorizer,
    }

    # Same count and every expected function present.
    assert len(TOOLS_DEFINITION) == len(expected_tools)
    for tool_func in expected_tools.values():
        assert tool_func in TOOLS_DEFINITION


def test_tools_exports():
    """__all__ should export exactly the public tool entry points."""
    from solr_mcp.tools import __all__

    assert set(__all__) == {
        "execute_list_collections",
        "execute_list_fields",
        "execute_select_query",
        "execute_vector_select_query",
        "execute_semantic_select_query",
        "get_default_text_vectorizer",
    }
48 | 
```
--------------------------------------------------------------------------------
/tests/unit/vector_provider/test_constants.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for vector provider constants."""
 2 | 
 3 | from solr_mcp.vector_provider.constants import (
 4 |     DEFAULT_OLLAMA_CONFIG,
 5 |     ENV_OLLAMA_BASE_URL,
 6 |     ENV_OLLAMA_MODEL,
 7 |     MODEL_DIMENSIONS,
 8 |     OLLAMA_EMBEDDINGS_PATH,
 9 | )
10 | 
11 | 
def test_default_ollama_config():
    """Defaults for the Ollama provider match the documented values."""
    expected = {
        "base_url": "http://localhost:11434",
        "model": "nomic-embed-text",
        "timeout": 30,
        "retries": 3,
    }
    assert isinstance(DEFAULT_OLLAMA_CONFIG, dict)
    for key, value in expected.items():
        assert key in DEFAULT_OLLAMA_CONFIG
        assert DEFAULT_OLLAMA_CONFIG[key] == value


def test_environment_variables():
    """Environment variable names are stable."""
    assert ENV_OLLAMA_BASE_URL == "OLLAMA_BASE_URL"
    assert ENV_OLLAMA_MODEL == "OLLAMA_MODEL"


def test_api_endpoints():
    """Ollama embeddings endpoint path is stable."""
    assert OLLAMA_EMBEDDINGS_PATH == "/api/embeddings"


def test_model_dimensions():
    """Model dimension mapping includes nomic-embed-text at 768 dims."""
    assert isinstance(MODEL_DIMENSIONS, dict)
    assert MODEL_DIMENSIONS.get("nomic-embed-text") == 768
```
--------------------------------------------------------------------------------
/solr.Dockerfile:
--------------------------------------------------------------------------------
```dockerfile
FROM solr:9.5

# Root is required to install packages and write under /opt/solr.
USER root

# Install SQL dependencies
# Fetches the Solr SQL module plus its Calcite/Avatica/Janino dependencies and
# copies the jars onto the webapp classpath so the SQL handler can load them.
# NOTE(review): jars are version-pinned but not checksum-verified — TODO confirm
# whether integrity checking is needed for this build.
RUN apt-get update && \
    apt-get install -y wget unzip && \
    mkdir -p /opt/solr/contrib/sql && \
    cd /opt/solr/contrib/sql && \
    wget https://repo1.maven.org/maven2/org/apache/solr/solr-sql/9.5.0/solr-sql-9.5.0.jar && \
    wget https://repo1.maven.org/maven2/org/apache/calcite/calcite-core/1.35.0/calcite-core-1.35.0.jar && \
    wget https://repo1.maven.org/maven2/org/apache/calcite/calcite-linq4j/1.35.0/calcite-linq4j-1.35.0.jar && \
    wget https://repo1.maven.org/maven2/org/apache/calcite/avatica/avatica-core/1.23.0/avatica-core-1.23.0.jar && \
    wget https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/3.21.7/protobuf-java-3.21.7.jar && \
    wget https://repo1.maven.org/maven2/org/apache/calcite/avatica/avatica-metrics/1.23.0/avatica-metrics-1.23.0.jar && \
    wget https://repo1.maven.org/maven2/org/locationtech/jts/jts-core/1.19.0/jts-core-1.19.0.jar && \
    wget https://repo1.maven.org/maven2/org/codehaus/janino/janino/3.1.9/janino-3.1.9.jar && \
    wget https://repo1.maven.org/maven2/org/codehaus/janino/commons-compiler/3.1.9/commons-compiler-3.1.9.jar && \
    cp *.jar /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/ && \
    chown -R solr:solr /opt/solr/contrib/sql /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/*.jar

# Return to the unprivileged solr user for runtime.
USER solr 
```
--------------------------------------------------------------------------------
/tests/unit/vector_provider/test_exceptions.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for vector provider exceptions."""
 2 | 
 3 | from solr_mcp.vector_provider.exceptions import (
 4 |     VectorConfigError,
 5 |     VectorConnectionError,
 6 |     VectorError,
 7 |     VectorGenerationError,
 8 | )
 9 | 
10 | 
def test_vector_error():
    """Base VectorError carries its message and derives from Exception."""
    err = VectorError("Test error")
    assert str(err) == "Test error"
    assert isinstance(err, Exception)


def test_vector_generation_error():
    """VectorGenerationError carries its message and derives from VectorError."""
    err = VectorGenerationError("Generation failed")
    assert str(err) == "Generation failed"
    assert isinstance(err, VectorError)
    assert isinstance(err, Exception)


def test_vector_config_error():
    """VectorConfigError carries its message and derives from VectorError."""
    err = VectorConfigError("Invalid config")
    assert str(err) == "Invalid config"
    assert isinstance(err, VectorError)
    assert isinstance(err, Exception)


def test_vector_connection_error():
    """VectorConnectionError carries its message and derives from VectorError."""
    err = VectorConnectionError("Connection failed")
    assert str(err) == "Connection failed"
    assert isinstance(err, VectorError)
    assert isinstance(err, Exception)


def test_error_inheritance():
    """Every specific error subclasses VectorError, which subclasses Exception."""
    for exc_type in (VectorGenerationError, VectorConfigError, VectorConnectionError):
        assert issubclass(exc_type, VectorError)
    assert issubclass(VectorError, Exception)
48 | 
```
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
```markdown
 1 | # Changelog
 2 | 
 3 | All notable changes to this project will be documented in this file.
 4 | 
 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 7 | 
 8 | ## [Unreleased]
 9 | 
10 | ### Added
11 | - Initial project structure
12 | - MCP server implementation
13 | - Solr client with search, vector search, and hybrid search capabilities
14 | - Embedding generation via Ollama using nomic-embed-text
15 | - Docker configuration for SolrCloud and ZooKeeper
16 | - Demo scripts and utilities for testing
17 | - Bitcoin whitepaper as sample document
18 | - Documentation (README, QUICKSTART, CONTRIBUTING)
19 | 
20 | ### Fixed
21 | - Improved search query transformation for better results
22 | - Fixed phrase proximity searches with `~5` operator
23 | - Proper field naming for Solr compatibility
24 | - Enhanced text analysis for hyphenated terms like "double-spending"
25 | - Improved synonym handling in Solr configuration
26 | - Fixed vector search configuration to use built-in capabilities
27 | - Improved error handling in Ollama embedding client with retries
28 | - Added proper timeout and fallback mechanisms for embedding generation
29 | - Fixed Solr schema URL paths in client implementation
30 | - Enhanced Docker healthcheck for Ollama service
31 | 
32 | ### Changed
33 | - Migrated from FastMCP to MCP 1.4.1
34 | 
35 | ## [0.1.0] - 2024-03-17
36 | ### Added
37 | - Initial release
38 | - MCP server implementation
39 | - Integration with SolrCloud
40 | - Support for basic search operations
41 | - Vector search capabilities
42 | - Hybrid search functionality
43 | - Embedding generation and indexing
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_select.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for executing SQL SELECT queries against Solr."""
 2 | 
 3 | from typing import Dict
 4 | 
 5 | from solr_mcp.tools.tool_decorator import tool
 6 | 
 7 | 
 8 | @tool()
 9 | async def execute_select_query(mcp, query: str) -> Dict:
10 |     """Execute SQL queries against Solr collections.
11 | 
12 |     Executes SQL queries against Solr collections with the following Solr-specific behaviors:
13 | 
14 |     Collection/Field Rules:
15 |     - Collections are used as table names (case-insensitive)
16 |     - Field names are case-sensitive and must exist in Solr schema
17 |     - SELECT * only allowed with LIMIT clause
18 |     - Unlimited queries restricted to docValues-enabled fields
19 |     - Reserved words must be backtick-escaped
20 | 
21 |     WHERE Clause Differences:
22 |     - Field must be on one side of predicate
23 |     - No comparing two constants or two fields
24 |     - No subqueries
25 |     - Solr syntax in values:
26 |       - '[0 TO 100]' for ranges
27 |       - '(term1 term2)' for non-phrase OR search
28 |     - String literals use single-quotes
29 | 
30 |     Supported Features:
31 |     - Operators: =, <>, >, >=, <, <=, IN, LIKE (wildcards), BETWEEN, IS [NOT] NULL
32 |     - Functions: COUNT(*), COUNT(DISTINCT), MIN, MAX, SUM, AVG
33 |     - GROUP BY: Uses faceting (fast) for low cardinality, map_reduce (slow) for high cardinality
34 |     - ORDER BY: Requires docValues-enabled fields
35 |     - LIMIT/OFFSET: Use 'OFFSET x FETCH NEXT y ROWS ONLY' syntax
36 |       - Performance of OFFSET degrades beyond 10k docs per shard
37 | 
38 |     Args:
39 |         mcp: SolrMCPServer instance
40 |         query: SQL query to execute
41 | 
42 |     Returns:
43 |         Query results
44 |     """
45 |     solr_client = mcp.solr_client
46 |     return await solr_client.execute_select_query(query)
47 | 
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_vector_select.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for executing vector search queries against Solr collections."""
 2 | 
 3 | from typing import Dict, List, Optional
 4 | 
 5 | from solr_mcp.tools.tool_decorator import tool
 6 | 
 7 | 
 8 | @tool()
 9 | async def execute_vector_select_query(
10 |     mcp, query: str, vector: List[float], field: Optional[str] = None
11 | ) -> Dict:
12 |     """Execute vector search queries against Solr collections.
13 | 
14 |     Extends solr_select tool with vector search capabilities.
15 | 
16 |     Additional Parameters:
17 |     - vector: Used to match against the collection's vector field, intended for vector search.
18 |     - field: Name of the vector field to search against (optional, will auto-detect if not specified)
19 | 
20 |     The query results will be ranked based on distance to the provided vector. Therefore, ORDER BY is not allowed.
21 | 
22 |     Collection/Field Rules:
23 |     - Vector field must be a dense_vector or knn_vector field type
24 |     - The specified field must exist in the collection schema
25 |     - The input vector dimensionality must match the field's vector dimensionality
26 | 
27 |     Supported Features:
28 |     - All standard SELECT query features except ORDER BY
29 |       - Results are ordered by vector distance
30 |     - Hybrid search combining keyword (SQL WHERE clauses) and vector distance (vector parameter)
31 | 
32 |     Args:
33 |         mcp: SolrMCPServer instance
34 |         query: SQL query to execute
35 |         vector: Query vector for similarity search
36 |         field: Name of the vector field to search against (optional, auto-detected if not specified)
37 | 
38 |     Returns:
39 |         Query results
40 |     """
41 |     solr_client = mcp.solr_client
42 |     return await solr_client.execute_vector_select_query(query, vector, field)
43 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/response.py:
--------------------------------------------------------------------------------
```python
 1 | """Response formatters for Solr results."""
 2 | 
 3 | import logging
 4 | from typing import Any, Dict, List, Optional, Union
 5 | 
 6 | import pysolr
 7 | from loguru import logger
 8 | 
 9 | from solr_mcp.solr.utils.formatting import format_search_results, format_sql_response
10 | 
11 | logger = logging.getLogger(__name__)
12 | 
13 | 
class ResponseFormatter:
    """Static helpers that normalize Solr responses for client consumption."""

    @staticmethod
    def format_search_results(
        results: pysolr.Results, start: int = 0
    ) -> Dict[str, Any]:
        """Format standard Solr search results.

        Args:
            results: Solr search results
            start: Starting index of results

        Returns:
            Formatted search results
        """
        return format_search_results(results, start)

    @staticmethod
    def format_sql_response(response: Dict[str, Any]) -> Dict[str, Any]:
        """Format a Solr SQL response.

        Args:
            response: Solr SQL response

        Returns:
            Formatted SQL response
        """
        return format_sql_response(response)

    @staticmethod
    def format_vector_search_results(
        results: Dict[str, Any], top_k: int
    ) -> Dict[str, Any]:
        """Format vector search results.

        Args:
            results: Vector search results
            top_k: Number of top results

        Returns:
            Formatted vector search results
        """
        # Local import (NOTE: presumably avoids a circular import — confirm).
        from solr_mcp.solr.vector import VectorSearchResults

        parsed = VectorSearchResults.from_solr_response(response=results, top_k=top_k)
        return parsed.to_dict()
63 | 
```
--------------------------------------------------------------------------------
/scripts/simple_mcp_test.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Simple MCP client test script.
 4 | """
 5 | 
 6 | import sys
 7 | import os
 8 | import json
 9 | import asyncio
10 | import httpx
11 | 
12 | # Add the project root to your path
13 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
14 | 
15 | from solr_mcp.solr.client import SolrClient
16 | 
async def direct_solr_test():
    """Test direct Solr connection.

    Exercises SolrClient.search with several query syntaxes against the
    'unified' collection, repeats a phrase search over raw HTTP for
    comparison, then prints client configuration details.
    """
    # NOTE(review): constructed with no arguments — confirm SolrClient
    # supplies a usable default configuration.
    client = SolrClient()
    
    # Test standard search with different query formats
    print("\n=== Testing direct Solr client search with different query formats ===")
    results1 = await client.search("double spend", collection="unified")
    print(f"Simple search results: {results1}")
    
    results2 = await client.search("content:double content:spend", collection="unified")
    print(f"Field-specific search results: {results2}")
    
    # Phrase proximity search: terms within 5 positions of each other.
    results3 = await client.search("content:\"double spend\"~5", collection="unified")
    print(f"Phrase search results: {results3}")
    
    # Test with HTTP client
    print("\n=== Testing direct HTTP search ===")
    async with httpx.AsyncClient() as http_client:
        response = await http_client.get(
            'http://localhost:8983/solr/unified/select',
            params={
                'q': 'content:"double spend"~5',
                'wt': 'json'
            }
        )
        print(f"HTTP search results: {response.text}")
    
    # Check solr config details
    print("\n=== Solr client configuration ===")
    print(f"Default collection: {client.config.default_collection}")
    # NOTE(review): list_collections() is not awaited here; if it is async
    # (cf. CollectionProvider.list_collections), this prints a coroutine
    # object — confirm whether an await is missing.
    print(f"Collections available: {client.list_collections()}")
48 | 
async def main():
    """Entry point: run the direct Solr smoke test."""
    await direct_solr_test()


if __name__ == "__main__":
    asyncio.run(main())
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_list_fields.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for listing fields in a Solr collection."""
 2 | 
 3 | from typing import Any, Dict
 4 | 
 5 | from solr_mcp.tools.tool_decorator import tool
 6 | 
 7 | 
 8 | @tool()
 9 | async def execute_list_fields(mcp: str, collection: str) -> Dict[str, Any]:
10 |     """List all fields in a Solr collection.
11 | 
12 |     This tool provides detailed information about each field in a Solr collection,
13 |     including how fields are related through copyField directives. Pay special
14 |     attention to fields that have 'copies_from' data - these are aggregate fields
15 |     that combine content from multiple source fields.
16 | 
17 |     For example, the '_text_' field is typically an aggregate field that combines
18 |     content from many text fields to provide a unified search experience. When you
19 |     see a field with 'copies_from' data, it means that field contains a copy of
20 |     the content from all the listed source fields.
21 | 
22 |     Args:
23 |         mcp: MCP instance name
24 |         collection: Name of the collection to get fields from
25 | 
26 |     Returns:
27 |         Dictionary containing:
28 |         - fields: List of field definitions with their properties including:
29 |             - name: Field name
30 |             - type: Field type (text_general, string, etc)
31 |             - indexed: Whether the field is indexed for searching
32 |             - stored: Whether the field values are stored
33 |             - docValues: Whether the field can be used for sorting/faceting
34 |             - multiValued: Whether the field can contain multiple values
35 |             - copies_from: List of source fields that copy their content to this field
36 |         - collection: Name of the collection queried
37 |     """
38 |     fields = await mcp.solr_client.list_fields(collection)
39 | 
40 |     return {"fields": fields, "collection": collection}
41 | 
```
--------------------------------------------------------------------------------
/solr_mcp/vector_provider/interfaces.py:
--------------------------------------------------------------------------------
```python
 1 | """Interfaces for vector providers."""
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from typing import Any, Dict, List, Optional
 5 | 
 6 | 
class VectorProvider(ABC):
    """Interface for generating vectors for semantic search.

    Concrete implementations wrap an embedding backend and expose both
    single-text and batch vector generation, plus metadata about the model.
    """

    @abstractmethod
    async def get_vector(self, text: str, model: Optional[str] = None) -> List[float]:
        """Get vector for a single text.

        Args:
            text: Text to generate vector for
            model: Optional model name to use (overrides default)

        Returns:
            List of floats representing the vector

        Raises:
            VectorGenerationError: If vector generation fails
            VectorConnectionError: If connection to service fails
        """
        pass

    @abstractmethod
    async def get_vectors(
        self, texts: List[str], model: Optional[str] = None
    ) -> List[List[float]]:
        """Get vectors for multiple texts.

        Args:
            texts: List of texts to generate vectors for
            model: Optional model name to use (overrides default)

        Returns:
            List of vectors (list of floats), one per input text

        Raises:
            VectorGenerationError: If vector generation fails
            VectorConnectionError: If connection to service fails
        """
        pass

    @property
    @abstractmethod
    def vector_dimension(self) -> int:
        """Get the dimension of vectors produced by this provider.

        Returns:
            Integer dimension of the vectors

        Raises:
            VectorConfigError: If unable to determine vector dimension
        """
        pass

    @property
    @abstractmethod
    def model_name(self) -> str:
        """Get the name of the model used by this provider.

        Returns:
            String name of the model
        """
        pass
```
--------------------------------------------------------------------------------
/scripts/simple_index.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Simple indexing script to demonstrate adding documents to Solr without embeddings.
 4 | """
 5 | 
 6 | import argparse
 7 | import json
 8 | import os
 9 | import sys
10 | import time
11 | import pysolr
12 | from typing import Dict, List, Any
13 | 
14 | # Add the project root to the path
15 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
16 | 
17 | 
def index_documents(json_file: str, collection: str = "documents", commit: bool = True):
    """
    Index documents from a JSON file into Solr without vector embeddings.

    Args:
        json_file: Path to the JSON file containing documents
        collection: Solr collection name
        commit: Whether to commit after indexing
    """
    # Read the documents to index.
    with open(json_file, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    # Talk to Solr directly over HTTP via pysolr.
    solr = pysolr.Solr(f"http://localhost:8983/solr/{collection}", always_commit=commit)

    print(f"Indexing {len(documents)} documents to {collection} collection...")

    try:
        solr.add(documents)
    except Exception as e:
        # Best-effort script: report the failure rather than crash.
        print(f"Error indexing documents: {e}")
    else:
        print(f"Successfully indexed {len(documents)} documents in collection '{collection}'")
43 | 
44 | 
45 | if __name__ == "__main__":
46 |     parser = argparse.ArgumentParser(description="Index documents in Solr without vector embeddings")
47 |     parser.add_argument("json_file", help="Path to the JSON file containing documents")
48 |     parser.add_argument("--collection", "-c", default="documents", help="Solr collection name")
49 |     parser.add_argument("--no-commit", dest="commit", action="store_false", help="Don't commit after indexing")
50 |     
51 |     args = parser.parse_args()
52 |     index_documents(args.json_file, args.collection, args.commit)
```
--------------------------------------------------------------------------------
/solr_mcp/solr/interfaces.py:
--------------------------------------------------------------------------------
```python
 1 | """Interfaces for Solr client components."""
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from typing import Any, Dict, List, Optional
 5 | 
 6 | 
 7 | class CollectionProvider(ABC):
 8 |     """Interface for providing collection information."""
 9 | 
10 |     @abstractmethod
11 |     async def list_collections(self) -> List[str]:
12 |         """List all available collections.
13 | 
14 |         Returns:
15 |             List of collection names
16 | 
17 |         Raises:
18 |             ConnectionError: If unable to retrieve collections
19 |         """
20 |         pass
21 | 
22 |     @abstractmethod
23 |     async def collection_exists(self, collection: str) -> bool:
24 |         """Check if a collection exists.
25 | 
26 |         Args:
27 |             collection: Name of the collection to check
28 | 
29 |         Returns:
30 |             True if the collection exists, False otherwise
31 | 
32 |         Raises:
33 |             ConnectionError: If unable to check collection existence
34 |         """
35 |         pass
36 | 
37 | 
class VectorSearchProvider(ABC):
    """Abstract interface for vector (dense-embedding) search operations."""

    @abstractmethod
    def execute_vector_search(
        self, client: Any, vector: List[float], field: str, top_k: Optional[int] = None
    ) -> Dict[str, Any]:
        """Run a similarity search against a dense-vector field.

        Args:
            client: Solr client instance
            vector: Dense vector for similarity search
            field: DenseVector field to search against
            top_k: Number of top results to return

        Returns:
            Search results as a dictionary

        Raises:
            SolrError: If vector search fails
        """
        ...

    @abstractmethod
    async def get_vector(self, text: str) -> List[float]:
        """Convert text into its dense-vector representation.

        Args:
            text: Text to convert to vector

        Returns:
            Vector as list of floats

        Raises:
            SolrError: If vector generation fails
        """
        ...
75 | 
```
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
```toml
 1 | [tool.poetry]
 2 | name = "solr-mcp"
 3 | version = "0.1.0"
 4 | description = "A Python package for accessing Solr indexes via Model Context Protocol (MCP)"
 5 | authors = ["Allen Day <[email protected]>"]
 6 | readme = "README.md"
 7 | license = "MIT"
 8 | repository = "https://github.com/allenday/solr-mcp"
 9 | packages = [{include = "solr_mcp"}]
10 | 
11 | [tool.poetry.scripts]
12 | solr-mcp = "solr_mcp.server:main"
13 | lint = "scripts.lint:main"
14 | format = "scripts.format:main"
15 | 
16 | [tool.poetry.dependencies]
17 | python = "^3.10"
18 | pysolr = "^3.9.0"
19 | mcp = "^1.4.1"
20 | httpx = "^0.27.0"
21 | pydantic = "^2.6.1"
22 | numpy = "^1.26.3"
23 | markdown = "^3.5.2"
24 | fastapi = "^0.109.2"
25 | uvicorn = "^0.27.1"
26 | python-frontmatter = "^1.1.0"
27 | loguru = "^0.7.3"
28 | kazoo = "^2.10.0"
29 | sqlglot = "^26.11.1"
30 | pytest-mock = "^3.14.0"
31 | 
32 | [tool.poetry.group.dev.dependencies]
33 | pytest = "^8.0.0"
34 | mypy = "^1.8.0"
35 | flake8 = "^7.0.0"
36 | black = "^24.2.0"
37 | isort = "^5.13.2"
38 | pytest-cov = "^6.0.0"
39 | pytest-asyncio = "^0.25.3"
40 | 
41 | [build-system]
42 | requires = ["poetry-core"]
43 | build-backend = "poetry.core.masonry.api"
44 | 
45 | [tool.pytest.ini_options]
46 | asyncio_mode = "strict"
47 | asyncio_default_fixture_loop_scope = "function"
48 | markers = [
49 |     "integration: marks tests that require external services (deselect with '-m \"not integration\"')"
50 | ]
51 | 
52 | [tool.mypy]
53 | python_version = "3.10"
54 | warn_return_any = true
55 | warn_unused_configs = true
56 | disallow_untyped_defs = true
57 | disallow_incomplete_defs = true
58 | check_untyped_defs = true
59 | disallow_untyped_decorators = true
60 | no_implicit_optional = true
61 | warn_redundant_casts = true
62 | warn_unused_ignores = true
63 | warn_no_return = true
64 | warn_unreachable = true
65 | 
66 | [[tool.mypy.overrides]]
67 | module = "tests.*"
68 | disallow_untyped_defs = false
69 | disallow_incomplete_defs = false
70 | 
71 | [tool.black]
72 | line-length = 88
73 | target-version = ['py310']
74 | include = '\.pyi?$'
75 | 
76 | [tool.isort]
77 | profile = "black"
78 | line_length = 88
79 | multi_line_output = 3
80 | 
81 | [tool.flake8]
82 | max-line-length = 88
83 | extend-ignore = ["E203"]
84 | exclude = [".venv", ".git", "__pycache__", "build", "dist"]
```
--------------------------------------------------------------------------------
/tests/unit/tools/test_solr_list_collections.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for Solr list collections tool."""
 2 | 
 3 | from unittest.mock import AsyncMock, MagicMock, patch
 4 | 
 5 | import pytest
 6 | from mcp.server.fastmcp.exceptions import ToolError
 7 | 
 8 | from solr_mcp.server import SolrMCPServer
 9 | from solr_mcp.tools.solr_list_collections import execute_list_collections
10 | 
11 | 
12 | @pytest.mark.asyncio
13 | class TestListCollectionsTool:
14 |     """Test list collections tool."""
15 | 
16 |     async def test_execute_list_collections_requires_server_instance(self):
17 |         """Test that execute_list_collections requires a proper server instance."""
18 |         # Test with string instead of server instance
19 |         with pytest.raises(
20 |             AttributeError, match="'str' object has no attribute 'solr_client'"
21 |         ):
22 |             await execute_list_collections("server")
23 | 
24 |     async def test_execute_list_collections_success(self):
25 |         """Test successful list collections execution."""
26 |         # Create mock server instance with solr_client
27 |         mock_server = MagicMock(spec=SolrMCPServer)
28 |         mock_solr_client = AsyncMock()
29 |         mock_solr_client.list_collections.return_value = ["unified", "collection2"]
30 |         mock_server.solr_client = mock_solr_client
31 | 
32 |         # Execute tool
33 |         result = await execute_list_collections(mock_server)
34 | 
35 |         # Verify result
36 |         assert isinstance(result, list)
37 |         assert "unified" in result
38 |         assert len(result) == 2
39 |         mock_solr_client.list_collections.assert_called_once()
40 | 
41 |     async def test_execute_list_collections_error(self):
42 |         """Test list collections error handling."""
43 |         # Create mock server instance with failing solr_client
44 |         mock_server = MagicMock(spec=SolrMCPServer)
45 |         mock_solr_client = AsyncMock()
46 |         mock_solr_client.list_collections.side_effect = Exception(
47 |             "Failed to list collections"
48 |         )
49 |         mock_server.solr_client = mock_solr_client
50 | 
51 |         # Execute tool and verify error is propagated
52 |         with pytest.raises(Exception, match="Failed to list collections"):
53 |             await execute_list_collections(mock_server)
54 | 
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_default_vectorizer.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for getting information about the default vector provider."""
 2 | 
 3 | import re
 4 | from typing import Any, Dict
 5 | from urllib.parse import urlparse
 6 | 
 7 | from solr_mcp.tools.tool_decorator import tool
 8 | from solr_mcp.vector_provider.constants import DEFAULT_OLLAMA_CONFIG, MODEL_DIMENSIONS
 9 | 
10 | 
11 | @tool()
12 | async def get_default_text_vectorizer(mcp) -> Dict[str, Any]:
13 |     """Get information about the default vector provider used for semantic search.
14 | 
15 |     Returns information about the default vector provider configuration used for semantic search,
16 |     including the model name, vector dimensionality, host, and port.
17 | 
18 |     This information is useful for ensuring that your vector fields in Solr have
19 |     the correct dimensionality to match the vector provider model.
20 | 
21 |     Returns:
22 |         Dictionary containing:
23 |         - vector_provider_model: The name of the default vector provider model
24 |         - vector_provider_dimension: The dimensionality of vectors produced by this model
25 |         - vector_provider_host: The host of the vector provider service
26 |         - vector_provider_port: The port of the vector provider service
27 |         - vector_provider_url: The full URL of the vector provider service
28 |     """
29 |     if hasattr(mcp, "solr_client") and hasattr(mcp.solr_client, "vector_manager"):
30 |         vector_manager = mcp.solr_client.vector_manager
31 |         model_name = vector_manager.client.model
32 |         dimension = MODEL_DIMENSIONS.get(model_name, 768)
33 |         base_url = vector_manager.client.base_url
34 |     else:
35 |         # Fall back to defaults
36 |         model_name = DEFAULT_OLLAMA_CONFIG["model"]
37 |         dimension = MODEL_DIMENSIONS.get(model_name, 768)
38 |         base_url = DEFAULT_OLLAMA_CONFIG["base_url"]
39 | 
40 |     # Parse URL to extract host and port
41 |     parsed_url = urlparse(base_url)
42 |     host = parsed_url.hostname or "localhost"
43 |     port = parsed_url.port or 11434  # Default Ollama port
44 | 
45 |     # Format as "model@host:port" for easy use with vector_provider parameter
46 |     formatted_spec = f"{model_name}@{host}:{port}"
47 | 
48 |     return {
49 |         "vector_provider_model": model_name,
50 |         "vector_provider_dimension": dimension,
51 |         "vector_provider_host": host,
52 |         "vector_provider_port": port,
53 |         "vector_provider_url": base_url,
54 |         "vector_provider_spec": formatted_spec,
55 |     }
56 | 
```
--------------------------------------------------------------------------------
/scripts/index_documents.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Script to index documents in Solr with vector embeddings
 4 | generated using Ollama's nomic-embed-text model.
 5 | """
 6 | 
 7 | import argparse
 8 | import asyncio
 9 | import json
10 | import os
11 | import sys
12 | from typing import Dict, List
13 | 
14 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
15 | 
16 | from solr_mcp.embeddings.client import OllamaClient
17 | from solr_mcp.solr.client import SolrClient
18 | 
19 | 
20 | async def index_documents(json_file: str, collection: str = "vectors", commit: bool = True):
21 |     """
22 |     Index documents from a JSON file into Solr with vector embeddings.
23 |     
24 |     Args:
25 |         json_file: Path to the JSON file containing documents
26 |         collection: Solr collection name
27 |         commit: Whether to commit after indexing
28 |     """
29 |     # Load documents
30 |     with open(json_file, 'r', encoding='utf-8') as f:
31 |         documents = json.load(f)
32 |     
33 |     # Initialize clients
34 |     solr_client = SolrClient()
35 |     
36 |     # Check if collection exists
37 |     collections = solr_client.list_collections()
38 |     if collection not in collections:
39 |         print(f"Warning: Collection '{collection}' not found in Solr. Available collections: {collections}")
40 |         response = input("Do you want to continue with the default collection? (y/N): ")
41 |         if response.lower() != 'y':
42 |             print("Aborting.")
43 |             return
44 |         collection = solr_client.config.default_collection
45 |     
46 |     # Index documents with embeddings
47 |     print(f"Indexing {len(documents)} documents with embeddings...")
48 |     
49 |     try:
50 |         success = await solr_client.batch_index_with_generated_embeddings(
51 |             documents=documents,
52 |             collection=collection,
53 |             commit=commit
54 |         )
55 |         
56 |         if success:
57 |             print(f"Successfully indexed {len(documents)} documents in collection '{collection}'")
58 |         else:
59 |             print("Indexing failed")
60 |             
61 |     except Exception as e:
62 |         print(f"Error indexing documents: {e}")
63 | 
64 | 
65 | if __name__ == "__main__":
66 |     parser = argparse.ArgumentParser(description="Index documents in Solr with vector embeddings")
67 |     parser.add_argument("json_file", help="Path to the JSON file containing documents")
68 |     parser.add_argument("--collection", "-c", default="vectors", help="Solr collection name")
69 |     parser.add_argument("--no-commit", dest="commit", action="store_false", help="Don't commit after indexing")
70 |     
71 |     args = parser.parse_args()
72 |     
73 |     asyncio.run(index_documents(args.json_file, args.collection, args.commit))
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/server_fixtures.py:
--------------------------------------------------------------------------------
```python
 1 | """Server fixtures for unit tests."""
 2 | 
 3 | import json
 4 | from unittest.mock import AsyncMock, MagicMock, Mock, patch
 5 | 
 6 | import pytest
 7 | 
 8 | from solr_mcp.server import SolrMCPServer
 9 | 
10 | from .common import MOCK_RESPONSES
11 | 
12 | 
13 | @pytest.fixture
14 | def mock_server(mock_solr_client, mock_config):
15 |     """Create a mock SolrMCPServer for testing."""
16 |     server = SolrMCPServer(
17 |         solr_base_url=mock_config.solr_base_url,
18 |         zookeeper_hosts=mock_config.zookeeper_hosts,
19 |         connection_timeout=mock_config.connection_timeout,
20 |     )
21 |     server.solr_client = mock_solr_client
22 |     return server
23 | 
24 | 
25 | @pytest.fixture
26 | def mock_server_instance():
27 |     """Create a mock FastMCP server instance for testing."""
28 |     mock_server = MagicMock()
29 | 
30 |     # Mock list collections response
31 |     async def mock_list_collections(*args, **kwargs):
32 |         return [{"type": "text", "text": json.dumps(MOCK_RESPONSES["collections"])}]
33 | 
34 |     mock_server.list_collections = AsyncMock(side_effect=mock_list_collections)
35 | 
36 |     # Mock select query response
37 |     async def mock_select(*args, **kwargs):
38 |         return [
39 |             {
40 |                 "type": "text",
41 |                 "text": json.dumps({"rows": [{"id": "1", "title": "Test Doc"}]}),
42 |             }
43 |         ]
44 | 
45 |     mock_server.select = AsyncMock(side_effect=mock_select)
46 | 
47 |     # Mock vector select response
48 |     async def mock_vector_select(*args, **kwargs):
49 |         return [
50 |             {
51 |                 "type": "text",
52 |                 "text": json.dumps({"rows": [{"id": "1", "title": "Test Doc"}]}),
53 |             }
54 |         ]
55 | 
56 |     mock_server.vector_select = AsyncMock(side_effect=mock_vector_select)
57 | 
58 |     # Mock semantic select response
59 |     async def mock_semantic_select(*args, **kwargs):
60 |         return [
61 |             {
62 |                 "type": "text",
63 |                 "text": json.dumps({"rows": [{"id": "1", "title": "Test Doc"}]}),
64 |             }
65 |         ]
66 | 
67 |     mock_server.semantic_select = AsyncMock(side_effect=mock_semantic_select)
68 | 
69 |     return mock_server
70 | 
71 | 
72 | @pytest.fixture
73 | def mock_singleton_server():
74 |     """Mock SolrMCPServer for singleton pattern testing."""
75 |     # Create a mock class to avoid affecting real singleton
76 |     MockServer = Mock(spec=SolrMCPServer)
77 |     MockServer._instance = None
78 | 
79 |     # Create a proper classmethod mock
80 |     def get_instance():
81 |         return MockServer._instance
82 | 
83 |     MockServer.get_instance = classmethod(get_instance)
84 | 
85 |     # Create two different instances
86 |     server1 = Mock(spec=SolrMCPServer)
87 |     server2 = Mock(spec=SolrMCPServer)
88 | 
89 |     with patch("solr_mcp.server.SolrMCPServer", MockServer):
90 |         yield {"MockServer": MockServer, "server1": server1, "server2": server2}
91 | 
```
--------------------------------------------------------------------------------
/tests/unit/test_config.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for Solr configuration."""
 2 | 
 3 | import json
 4 | from unittest.mock import mock_open, patch
 5 | 
 6 | import pytest
 7 | 
 8 | from solr_mcp.solr.config import SolrConfig
 9 | from solr_mcp.solr.exceptions import ConfigurationError
10 | 
11 | 
12 | def test_config_defaults():
13 |     """Test default configuration values."""
14 |     config = SolrConfig(
15 |         solr_base_url="http://test:8983/solr", zookeeper_hosts=["test:2181"]
16 |     )
17 |     assert config.solr_base_url == "http://test:8983/solr"
18 |     assert config.zookeeper_hosts == ["test:2181"]
19 |     assert config.connection_timeout == 10
20 | 
21 | 
22 | def test_config_custom_values():
23 |     """Test custom configuration values."""
24 |     config = SolrConfig(
25 |         solr_base_url="http://custom:8983/solr",
26 |         zookeeper_hosts=["custom:2181"],
27 |         connection_timeout=20,
28 |     )
29 |     assert config.solr_base_url == "http://custom:8983/solr"
30 |     assert config.zookeeper_hosts == ["custom:2181"]
31 |     assert config.connection_timeout == 20
32 | 
33 | 
34 | def test_config_validation():
35 |     """Test configuration validation."""
36 |     with pytest.raises(ConfigurationError, match="solr_base_url is required"):
37 |         SolrConfig(zookeeper_hosts=["test:2181"])
38 | 
39 |     with pytest.raises(ConfigurationError, match="zookeeper_hosts is required"):
40 |         SolrConfig(solr_base_url="http://test:8983/solr")
41 | 
42 |     with pytest.raises(ConfigurationError, match="connection_timeout must be positive"):
43 |         SolrConfig(
44 |             solr_base_url="http://test:8983/solr",
45 |             zookeeper_hosts=["test:2181"],
46 |             connection_timeout=0,
47 |         )
48 | 
49 | 
50 | def test_load_from_file():
51 |     """Test loading configuration from file."""
52 |     config_data = {
53 |         "solr_base_url": "http://test:8983/solr",
54 |         "zookeeper_hosts": ["test:2181"],
55 |         "connection_timeout": 20,
56 |     }
57 | 
58 |     with patch("builtins.open", mock_open(read_data=json.dumps(config_data))):
59 |         config = SolrConfig.load("config.json")
60 |         assert config.solr_base_url == "http://test:8983/solr"
61 |         assert config.zookeeper_hosts == ["test:2181"]
62 |         assert config.connection_timeout == 20
63 | 
64 | 
65 | def test_load_invalid_json():
66 |     """Test loading invalid JSON."""
67 |     with patch("builtins.open", mock_open(read_data="invalid json")):
68 |         with pytest.raises(
69 |             ConfigurationError, match="Invalid JSON in configuration file"
70 |         ):
71 |             SolrConfig.load("config.json")
72 | 
73 | 
74 | def test_load_missing_required_field():
75 |     """Test loading config with missing required field."""
76 |     config_data = {
77 |         "solr_base_url": "http://test:8983/solr"
78 |         # Missing zookeeper_hosts
79 |     }
80 | 
81 |     with patch("builtins.open", mock_open(read_data=json.dumps(config_data))):
82 |         with pytest.raises(ConfigurationError, match="zookeeper_hosts is required"):
83 |             SolrConfig.load("config.json")
84 | 
```
--------------------------------------------------------------------------------
/scripts/direct_mcp_test.py:
--------------------------------------------------------------------------------
```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Direct MCP server test script.
  4 | Tests the raw JSON-RPC interface that Claude uses to communicate with MCP servers.
  5 | """
  6 | 
  7 | import sys
  8 | import os
  9 | import json
 10 | import subprocess
 11 | import time
 12 | from threading import Thread
 13 | import tempfile
 14 | 
 15 | # Add the project root to your path
 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 17 | 
 18 | # First clean up any existing MCP servers
 19 | os.system("pkill -f 'python -m solr_mcp.server'")
 20 | time.sleep(1)  # Let them shut down
 21 | 
 22 | def write_to_stdin(process, data):
 23 |     """Write data to the stdin of a process and flush."""
 24 |     process.stdin.write(data)
 25 |     process.stdin.flush()
 26 | 
 27 | def read_from_stdout(process):
 28 |     """Read a JSON-RPC message from stdout of a process."""
 29 |     line = process.stdout.readline().strip()
 30 |     if not line:
 31 |         return None
 32 |     try:
 33 |         return json.loads(line)
 34 |     except json.JSONDecodeError:
 35 |         print(f"Error decoding JSON: {line}")
 36 |         return None
 37 | 
 38 | # Start a new MCP server process
 39 | cmd = ["python", "-m", "solr_mcp.server"]
 40 | server_process = subprocess.Popen(
 41 |     cmd,
 42 |     stdin=subprocess.PIPE,
 43 |     stdout=subprocess.PIPE,
 44 |     stderr=subprocess.PIPE,
 45 |     text=True,
 46 |     bufsize=1,  # Line buffered
 47 | )
 48 | 
 49 | print("MCP server started.")
 50 | time.sleep(2)  # Give it time to initialize
 51 | 
 52 | # Test search methods
 53 | def test_search(query):
 54 |     print(f"\n\nTesting search for: '{query}'")
 55 |     
 56 |     # Try a standard search
 57 |     request = {
 58 |         "jsonrpc": "2.0",
 59 |         "id": "1",
 60 |         "method": "execute_tool",
 61 |         "params": {
 62 |             "name": "solr_search",
 63 |             "arguments": {
 64 |                 "query": query
 65 |             }
 66 |         }
 67 |     }
 68 |     
 69 |     print("\nSending search request:", json.dumps(request, indent=2))
 70 |     write_to_stdin(server_process, json.dumps(request) + "\n")
 71 |     response = read_from_stdout(server_process)
 72 |     print("\nGot response:", json.dumps(response, indent=2) if response else "No response")
 73 |     
 74 |     # Try a hybrid search
 75 |     request = {
 76 |         "jsonrpc": "2.0",
 77 |         "id": "2",
 78 |         "method": "execute_tool",
 79 |         "params": {
 80 |             "name": "solr_hybrid_search",
 81 |             "arguments": {
 82 |                 "query": query,
 83 |                 "blend_factor": 0.5
 84 |             }
 85 |         }
 86 |     }
 87 |     
 88 |     print("\nSending hybrid search request:", json.dumps(request, indent=2))
 89 |     write_to_stdin(server_process, json.dumps(request) + "\n")
 90 |     response = read_from_stdout(server_process)
 91 |     print("\nGot hybrid response:", json.dumps(response, indent=2) if response else "No response")
 92 | 
 93 | # Test with a query we know exists
 94 | test_search("double spend")
 95 | 
 96 | # Test with another query
 97 | test_search("blockchain")
 98 | 
 99 | # Clean up
100 | print("\nCleaning up...")
101 | server_process.terminate()
102 | server_process.wait()
103 | print("Done!")
```
--------------------------------------------------------------------------------
/tests/unit/tools/test_base.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for base tool decorator."""
 2 | 
 3 | from typing import Dict, List
 4 | 
 5 | import pytest
 6 | 
 7 | from solr_mcp.tools.base import tool
 8 | 
 9 | 
10 | def test_tool_decorator_default_values():
11 |     """Test tool decorator with default values."""
12 | 
13 |     @tool()
14 |     def sample_tool() -> str:
15 |         """Sample tool docstring."""
16 |         return "test"
17 | 
18 |     assert hasattr(sample_tool, "_is_tool")
19 |     assert sample_tool._is_tool is True
20 |     assert sample_tool._tool_name == "sample_tool"
21 |     assert "Sample tool docstring" in sample_tool._tool_description
22 |     assert sample_tool._tool_parameters == {}
23 | 
24 | 
25 | def test_tool_decorator_custom_values():
26 |     """Test tool decorator with custom values."""
27 | 
28 |     @tool(
29 |         name="custom_name",
30 |         description="Custom description",
31 |         parameters={"param": "description"},
32 |     )
33 |     def sample_tool() -> str:
34 |         return "test"
35 | 
36 |     assert sample_tool._is_tool is True
37 |     assert sample_tool._tool_name == "custom_name"
38 |     assert sample_tool._tool_description == "Custom description"
39 |     assert sample_tool._tool_parameters == {"param": "description"}
40 | 
41 | 
42 | def test_tool_decorator_result_wrapping():
43 |     """Test that tool decorator properly wraps results."""
44 | 
45 |     @tool()
46 |     def string_tool() -> str:
47 |         return "test"
48 | 
49 |     @tool()
50 |     def dict_tool() -> Dict[str, str]:
51 |         return {"key": "value"}
52 | 
53 |     @tool()
54 |     def list_tool() -> List[Dict[str, str]]:
55 |         return [{"type": "text", "text": "test"}]
56 | 
57 |     # String result should be wrapped
58 |     result = string_tool()
59 |     assert isinstance(result, list)
60 |     assert len(result) == 1
61 |     assert result[0]["type"] == "text"
62 |     assert result[0]["text"] == "test"
63 | 
64 |     # Dict result should be wrapped
65 |     result = dict_tool()
66 |     assert isinstance(result, list)
67 |     assert len(result) == 1
68 |     assert result[0]["type"] == "text"
69 |     assert result[0]["text"] == "{'key': 'value'}"
70 | 
71 |     # List result should be returned as is
72 |     result = list_tool()
73 |     assert isinstance(result, list)
74 |     assert len(result) == 1
75 |     assert result[0]["type"] == "text"
76 |     assert result[0]["text"] == "test"
77 | 
78 | 
79 | def test_tool_decorator_preserves_function_metadata():
80 |     """Test that tool decorator preserves function metadata."""
81 | 
82 |     @tool()
83 |     def sample_tool(param1: str, param2: int = 0) -> str:
84 |         """Sample tool docstring."""
85 |         return f"{param1} {param2}"
86 | 
87 |     assert sample_tool.__name__ == "sample_tool"
88 |     assert "Sample tool docstring" in sample_tool.__doc__
89 |     # Check that the function signature is preserved
90 |     import inspect
91 | 
92 |     sig = inspect.signature(sample_tool)
93 |     assert list(sig.parameters.keys()) == ["param1", "param2"]
94 |     assert sig.parameters["param1"].annotation == str
95 |     assert sig.parameters["param2"].annotation == int
96 |     assert sig.parameters["param2"].default == 0
97 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/vector_fixtures.py:
--------------------------------------------------------------------------------
```python
 1 | """Vector search fixtures for unit tests."""
 2 | 
 3 | import json
 4 | from unittest.mock import MagicMock, Mock, patch
 5 | 
 6 | import pytest
 7 | import requests
 8 | 
 9 | from solr_mcp.solr.interfaces import VectorSearchProvider
10 | from solr_mcp.solr.vector.manager import VectorManager
11 | from solr_mcp.vector_provider.clients.ollama import OllamaVectorProvider
12 | 
13 | 
14 | @pytest.fixture
15 | def mock_ollama(request):
16 |     """Parameterized mock for Ollama client.
17 | 
18 |     Args:
19 |         request: Pytest request object that can contain parameters:
20 |             - vector_dim: Dimension of returned vectors
21 |             - error: Whether to simulate an error
22 |     """
23 |     # Get parameters or use defaults
24 |     vector_dim = getattr(request, "param", {}).get("vector_dim", 3)
25 |     error = getattr(request, "param", {}).get("error", False)
26 | 
27 |     provider = Mock(spec=OllamaVectorProvider)
28 | 
29 |     if error:
30 |         provider.get_vector.side_effect = Exception("Ollama API error")
31 |     else:
32 |         provider.get_vector.return_value = [0.1] * vector_dim
33 | 
34 |     return provider
35 | 
36 | 
37 | @pytest.fixture
38 | def mock_vector_provider(request):
39 |     """Parameterized mock for vector provider.
40 | 
41 |     Args:
42 |         request: Pytest request object that can contain parameters:
43 |             - vector_dim: Dimension of returned vectors
44 |             - error: Whether to simulate an error
45 |     """
46 |     # Get parameters or use defaults
47 |     vector_dim = getattr(request, "param", {}).get("vector_dim", 768)
48 |     error = getattr(request, "param", {}).get("error", False)
49 | 
50 |     provider = Mock(spec=VectorSearchProvider)
51 | 
52 |     if error:
53 |         provider.get_vector.side_effect = Exception("Vector API error")
54 |     else:
55 |         provider.get_vector.return_value = [0.1] * vector_dim
56 | 
57 |     return provider
58 | 
59 | 
60 | @pytest.fixture
61 | def mock_vector_manager(request):
62 |     """Parameterized mock VectorManager.
63 | 
64 |     Args:
65 |         request: Pytest request object that can contain parameters:
66 |             - vector_dim: Dimension of returned vectors
67 |             - error: Whether to simulate an error
68 |     """
69 |     # Get parameters or use defaults
70 |     vector_dim = getattr(request, "param", {}).get("vector_dim", 3)
71 |     error = getattr(request, "param", {}).get("error", False)
72 | 
73 |     manager = Mock(spec=VectorManager)
74 | 
75 |     if error:
76 |         manager.get_vector.side_effect = Exception("Vector generation error")
77 |     else:
78 |         manager.get_vector.return_value = [0.1] * vector_dim
79 | 
80 |     return manager
81 | 
82 | 
83 | @pytest.fixture
84 | def mock_ollama_response(request):
85 |     """Parameterized mock Ollama API response.
86 | 
87 |     Args:
88 |         request: Pytest request object that can contain parameters:
89 |             - vector_dim: Dimension of returned vectors
90 |             - model: Model name to include in response
91 |     """
92 |     # Get parameters or use defaults
93 |     vector_dim = getattr(request, "param", {}).get("vector_dim", 5)
94 |     model = getattr(request, "param", {}).get("model", "nomic-embed-text")
95 | 
96 |     return {"embedding": [0.1] * vector_dim, "model": model}
97 | 
```
--------------------------------------------------------------------------------
/tests/unit/vector_provider/test_interfaces.py:
--------------------------------------------------------------------------------
```python
  1 | """Tests for vector provider interfaces."""
  2 | 
  3 | from typing import List
  4 | 
  5 | import pytest
  6 | 
  7 | from solr_mcp.vector_provider.exceptions import (
  8 |     VectorConfigError,
  9 |     VectorConnectionError,
 10 |     VectorGenerationError,
 11 | )
 12 | from solr_mcp.vector_provider.interfaces import VectorProvider
 13 | 
 14 | 
 15 | class MockVectorProvider(VectorProvider):
 16 |     """Mock implementation of VectorProvider for testing."""
 17 | 
 18 |     def __init__(self, dimension: int = 768):
 19 |         self._dimension = dimension
 20 |         self._model = "mock-model"
 21 | 
 22 |     async def get_vector(self, text: str) -> List[float]:
 23 |         if text == "error":
 24 |             raise VectorGenerationError("Test error")
 25 |         return [0.1] * self._dimension
 26 | 
 27 |     async def get_vectors(self, texts: List[str]) -> List[List[float]]:
 28 |         if any(t == "error" for t in texts):
 29 |             raise VectorGenerationError("Test error")
 30 |         return [[0.1] * self._dimension for _ in texts]
 31 | 
 32 |     @property
 33 |     def vector_dimension(self) -> int:
 34 |         return self._dimension
 35 | 
 36 |     @property
 37 |     def model_name(self) -> str:
 38 |         return self._model
 39 | 
 40 | 
 41 | def test_vector_provider_is_abstract():
 42 |     """Test that VectorProvider cannot be instantiated directly."""
 43 |     with pytest.raises(TypeError):
 44 |         VectorProvider()
 45 | 
 46 | 
 47 | def test_vector_provider_requires_methods():
 48 |     """Test that implementing class must define all abstract methods."""
 49 | 
 50 |     class IncompleteProvider(VectorProvider):
 51 |         pass
 52 | 
 53 |     with pytest.raises(TypeError):
 54 |         IncompleteProvider()
 55 | 
 56 | 
 57 | @pytest.mark.asyncio
 58 | async def test_mock_provider_get_vector():
 59 |     """Test get_vector implementation."""
 60 |     provider = MockVectorProvider()
 61 |     result = await provider.get_vector("test")
 62 |     assert len(result) == 768
 63 |     assert all(x == 0.1 for x in result)
 64 | 
 65 | 
 66 | @pytest.mark.asyncio
 67 | async def test_mock_provider_get_vector_error():
 68 |     """Test get_vector error handling."""
 69 |     provider = MockVectorProvider()
 70 |     with pytest.raises(VectorGenerationError):
 71 |         await provider.get_vector("error")
 72 | 
 73 | 
 74 | @pytest.mark.asyncio
 75 | async def test_mock_provider_get_vectors():
 76 |     """Test get_vectors implementation."""
 77 |     provider = MockVectorProvider()
 78 |     texts = ["test1", "test2"]
 79 |     result = await provider.get_vectors(texts)
 80 |     assert len(result) == 2
 81 |     assert all(len(v) == 768 for v in result)
 82 |     assert all(all(x == 0.1 for x in v) for v in result)
 83 | 
 84 | 
 85 | @pytest.mark.asyncio
 86 | async def test_mock_provider_get_vectors_error():
 87 |     """Test get_vectors error handling."""
 88 |     provider = MockVectorProvider()
 89 |     with pytest.raises(VectorGenerationError):
 90 |         await provider.get_vectors(["test", "error"])
 91 | 
 92 | 
 93 | def test_mock_provider_vector_dimension():
 94 |     """Test vector_dimension property."""
 95 |     provider = MockVectorProvider(dimension=512)
 96 |     assert provider.vector_dimension == 512
 97 | 
 98 | 
 99 | def test_mock_provider_model_name():
100 |     """Test model_name property."""
101 |     provider = MockVectorProvider()
102 |     assert provider.model_name == "mock-model"
103 | 
```
--------------------------------------------------------------------------------
/tests/unit/tools/test_solr_list_fields.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for the list fields tool."""
 2 | 
 3 | import pytest
 4 | 
 5 | from solr_mcp.solr.exceptions import SolrError
 6 | from solr_mcp.tools.solr_list_fields import execute_list_fields
 7 | 
# Sample field data for testing.
# "id" is a plain stored field; "_text_" is a copy-field target that
# aggregates "title" and "content" and is indexed but not stored.
FIELD_DATA = {
    "fields": [
        {"name": "id", "type": "string", "indexed": True, "stored": True},
        {
            "name": "_text_",
            "type": "text_general",
            "indexed": True,
            "stored": False,
            "copies_from": ["title", "content"],
        },
    ]
}
21 | 
22 | 
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "collection,custom_fields",
    [
        ("test_collection", None),
        (
            "custom_collection",
            [
                {
                    "name": "custom_id",
                    "type": "string",
                    "indexed": True,
                    "stored": True,
                },
                {
                    "name": "custom_text",
                    "type": "text_general",
                    "indexed": True,
                    "stored": False,
                },
            ],
        ),
    ],
)
async def test_execute_list_fields_success(mock_server, collection, custom_fields):
    """list_fields tool surfaces the client's fields for any collection."""
    # Fall back to the module-level sample data when no custom set is given.
    expected_fields = FIELD_DATA["fields"] if custom_fields is None else custom_fields
    mock_server.solr_client.list_fields.return_value = expected_fields

    result = await execute_list_fields(mock_server, collection)

    # The tool echoes the collection name and the full field list back.
    assert result["collection"] == collection
    assert len(result["fields"]) == len(expected_fields)
    assert result["fields"][0]["name"] == expected_fields[0]["name"]

    # The default data set carries copy-field metadata which must survive.
    if custom_fields is None and "copies_from" in expected_fields[1]:
        assert "copies_from" in result["fields"][1]

    # Exactly one lookup against the requested collection.
    mock_server.solr_client.list_fields.assert_called_once_with(collection)
67 | 
68 | 
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "error_message",
    ["Failed to list fields", "Collection not found", "Connection error"],
)
async def test_execute_list_fields_error(mock_solr_client, mock_config, error_message):
    """Errors raised by the Solr client must propagate out of the tool."""
    from solr_mcp.server import SolrMCPServer

    # Build a client that always fails with the parameterized message.
    failing_client = mock_solr_client(param={"error": True})
    failing_client.list_fields.side_effect = SolrError(error_message)

    server = SolrMCPServer(
        solr_base_url=mock_config.solr_base_url,
        zookeeper_hosts=mock_config.zookeeper_hosts,
        connection_timeout=mock_config.connection_timeout,
    )
    server.solr_client = failing_client

    with pytest.raises(SolrError, match=error_message):
        await execute_list_fields(server, "test_collection")
94 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/schema/cache.py:
--------------------------------------------------------------------------------
```python
  1 | """Field caching for SolrCloud client."""
  2 | 
  3 | import logging
  4 | import time
  5 | from typing import Any, Dict, List, Optional
  6 | 
  7 | from loguru import logger
  8 | 
  9 | from solr_mcp.solr.constants import SYNTHETIC_SORT_FIELDS
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class FieldCache:
 15 |     """Caches field information for Solr collections."""
 16 | 
 17 |     def __init__(self):
 18 |         """Initialize the FieldCache."""
 19 |         self._cache: Dict[str, Dict[str, Any]] = {}
 20 | 
 21 |     def get(self, collection: str) -> Optional[Dict[str, Any]]:
 22 |         """Get cached field information for a collection.
 23 | 
 24 |         Args:
 25 |             collection: Collection name
 26 | 
 27 |         Returns:
 28 |             Dict containing field information or None if not cached
 29 |         """
 30 |         if collection in self._cache:
 31 |             return self._cache[collection]
 32 |         return None
 33 | 
 34 |     def set(self, collection: str, field_info: Dict[str, Any]) -> None:
 35 |         """Cache field information for a collection.
 36 | 
 37 |         Args:
 38 |             collection: Collection name
 39 |             field_info: Field information to cache
 40 |         """
 41 |         self._cache[collection] = {**field_info, "last_updated": time.time()}
 42 | 
 43 |     def is_stale(self, collection: str, max_age: float = 300.0) -> bool:
 44 |         """Check if cached field information is stale.
 45 | 
 46 |         Args:
 47 |             collection: Collection name
 48 |             max_age: Maximum age in seconds before cache is considered stale
 49 | 
 50 |         Returns:
 51 |             True if cache is stale or missing, False otherwise
 52 |         """
 53 |         if collection not in self._cache:
 54 |             return True
 55 | 
 56 |         last_updated = self._cache[collection].get("last_updated", 0)
 57 |         return (time.time() - last_updated) > max_age
 58 | 
 59 |     def get_or_default(self, collection: str) -> Dict[str, Any]:
 60 |         """Get cached field information or return defaults.
 61 | 
 62 |         Args:
 63 |             collection: Collection name
 64 | 
 65 |         Returns:
 66 |             Dict containing field information (cached or default)
 67 |         """
 68 |         if collection in self._cache:
 69 |             return self._cache[collection]
 70 | 
 71 |         # Return safe defaults
 72 |         return {
 73 |             "searchable_fields": ["_text_"],
 74 |             "sortable_fields": {"score": SYNTHETIC_SORT_FIELDS["score"]},
 75 |             "last_updated": time.time(),
 76 |         }
 77 | 
 78 |     def clear(self, collection: Optional[str] = None) -> None:
 79 |         """Clear cached field information.
 80 | 
 81 |         Args:
 82 |             collection: Collection name to clear, or None to clear all
 83 |         """
 84 |         if collection:
 85 |             self._cache.pop(collection, None)
 86 |         else:
 87 |             self._cache.clear()
 88 | 
 89 |     def update(self, collection: str, field_info: Dict[str, Any]) -> None:
 90 |         """Update cached field information.
 91 | 
 92 |         Args:
 93 |             collection: Collection name
 94 |             field_info: Field information to update
 95 |         """
 96 |         if collection in self._cache:
 97 |             self._cache[collection].update(field_info)
 98 |             self._cache[collection]["last_updated"] = time.time()
 99 |         else:
100 |             self.set(collection, field_info)
101 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/config_fixtures.py:
--------------------------------------------------------------------------------
```python
  1 | """Configuration fixtures for unit tests."""
  2 | 
  3 | from unittest.mock import Mock, mock_open, patch
  4 | 
  5 | import pytest
  6 | 
  7 | from solr_mcp.solr.config import SolrConfig
  8 | 
  9 | 
 10 | @pytest.fixture
 11 | def mock_config(request):
 12 |     """Parameterized SolrConfig mock.
 13 | 
 14 |     Args:
 15 |         request: Pytest request object that can contain parameters:
 16 |             - base_url: Custom Solr base URL
 17 |             - zk_hosts: Custom ZooKeeper hosts
 18 |             - timeout: Custom connection timeout
 19 |     """
 20 |     # Get parameters or use defaults
 21 |     base_url = getattr(request, "param", {}).get(
 22 |         "base_url", "http://localhost:8983/solr"
 23 |     )
 24 |     zk_hosts = getattr(request, "param", {}).get("zk_hosts", ["localhost:2181"])
 25 |     timeout = getattr(request, "param", {}).get("timeout", 10)
 26 | 
 27 |     config = Mock(spec=SolrConfig)
 28 |     config.solr_base_url = base_url
 29 |     config.zookeeper_hosts = zk_hosts
 30 |     config.connection_timeout = timeout
 31 | 
 32 |     return config
 33 | 
 34 | 
 35 | @pytest.fixture(
 36 |     params=[
 37 |         # Format: (fixture_name, content, side_effect)
 38 |         (
 39 |             "valid",
 40 |             """
 41 |     {
 42 |         "solr_base_url": "http://solr:8983/solr",
 43 |         "zookeeper_hosts": ["zk1:2181", "zk2:2181"],
 44 |         "connection_timeout": 30
 45 |     }
 46 |     """,
 47 |             None,
 48 |         ),
 49 |         ("invalid_json", "invalid json content", None),
 50 |         (
 51 |             "minimal",
 52 |             """
 53 |     {
 54 |         "zookeeper_hosts": ["zk1:2181"]
 55 |     }
 56 |     """,
 57 |             None,
 58 |         ),
 59 |         ("missing", None, FileNotFoundError()),
 60 |     ]
 61 | )
 62 | def mock_config_file(request):
 63 |     """Parameterized fixture for different config file scenarios."""
 64 |     fixture_name, content, side_effect = request.param
 65 | 
 66 |     if side_effect:
 67 |         with patch("builtins.open", side_effect=side_effect):
 68 |             yield fixture_name
 69 |     else:
 70 |         with patch("builtins.open", mock_open(read_data=content)):
 71 |             yield fixture_name
 72 | 
 73 | 
 74 | @pytest.fixture
 75 | def mock_field_manager_methods():
 76 |     """Mock FieldManager methods for testing."""
 77 |     mock_fields = {
 78 |         "searchable_fields": ["title", "content"],
 79 |         "sortable_fields": {
 80 |             "id": {"directions": ["asc", "desc"], "default_direction": "asc"},
 81 |             "score": {
 82 |                 "directions": ["asc", "desc"],
 83 |                 "default_direction": "desc",
 84 |                 "type": "numeric",
 85 |                 "searchable": True,
 86 |             },
 87 |         },
 88 |     }
 89 | 
 90 |     def patch_get_collection_fields(field_manager):
 91 |         """Create a context manager for patching _get_collection_fields."""
 92 |         return patch.object(
 93 |             field_manager, "_get_collection_fields", return_value=mock_fields
 94 |         )
 95 | 
 96 |     def patch_get_searchable_fields(field_manager):
 97 |         """Create a context manager for patching _get_searchable_fields."""
 98 |         return patch.object(
 99 |             field_manager, "_get_searchable_fields", side_effect=Exception("API error")
100 |         )
101 | 
102 |     return {
103 |         "mock_fields": mock_fields,
104 |         "patch_get_collection_fields": patch_get_collection_fields,
105 |         "patch_get_searchable_fields": patch_get_searchable_fields,
106 |     }
107 | 
```
--------------------------------------------------------------------------------
/scripts/setup.sh:
--------------------------------------------------------------------------------
```bash
#!/bin/bash
# Setup script for Solr MCP Server
#
# End-to-end bootstrap: verifies Docker tooling, creates a Python venv,
# starts the docker-compose stack, creates the "unified" collection,
# indexes the bundled Bitcoin whitepaper, and smoke-tests search.

set -e  # Exit immediately if a command exits with a non-zero status

echo "=== Setting up Solr MCP Server ==="

# Check if Docker is installed
if ! command -v docker &> /dev/null; then
    echo "Docker is not installed. Please install Docker and Docker Compose first."
    exit 1
fi

# Check if Docker Compose is installed
if ! command -v docker-compose &> /dev/null; then
    echo "Docker Compose is not installed. Please install Docker Compose first."
    exit 1
fi

# Create Python virtual environment
echo "Creating Python virtual environment..."
python3 -m venv venv
source venv/bin/activate

# Install dependencies
echo "Installing dependencies..."
pip install poetry
poetry install

# Start Docker containers
echo "Starting SolrCloud, ZooKeeper, and Ollama containers..."
docker-compose up -d

# Wait for Solr to be ready
echo "Waiting for SolrCloud to be ready..."
# Initial grace period before polling, then poll every 5s up to 30 attempts.
sleep 10
attempts=0
max_attempts=30
while ! curl -s http://localhost:8983/solr/ > /dev/null; do
    attempts=$((attempts+1))
    if [ $attempts -ge $max_attempts ]; then
        echo "Error: SolrCloud did not start in time. Please check docker-compose logs."
        exit 1
    fi
    echo "Waiting for SolrCloud to start... (attempt $attempts/$max_attempts)"
    sleep 5
done

# Create unified collection
echo "Creating unified collection..."
python scripts/create_unified_collection.py

# Process demo data (Bitcoin whitepaper)
echo "Processing demo data..."
python scripts/process_markdown.py data/bitcoin-whitepaper.md --output data/processed/bitcoin_sections.json

# Index demo data to unified collection
echo "Indexing demo data to unified collection..."
python scripts/unified_index.py data/processed/bitcoin_sections.json --collection unified

# Test search to ensure content is indexed properly
# (inline Python: one proximity query against the unified collection)
echo "Testing search functionality..."
python -c "
import httpx
import asyncio

async def test_search():
    async with httpx.AsyncClient() as client:
        response = await client.get(
            'http://localhost:8983/solr/unified/select',
            params={
                'q': 'content:\"double spend\"~5',
                'wt': 'json'
            }
        )
        results = response.json()
        if results.get('response', {}).get('numFound', 0) > 0:
            print('✅ Search test successful! Found documents matching \"double spend\"')
        else:
            print('❌ Warning: No documents found for \"double spend\". Search may not work properly.')
            print('   Try running: python scripts/diagnose_search.py --collection unified --term \"double spend\"')

asyncio.run(test_search())
"

echo ""
echo "=== Setup Complete! ==="
echo ""
echo "You can now use the Solr MCP server with the following commands:"
echo ""
echo "1. Start the MCP server:"
echo "   python -m solr_mcp.server"
echo ""
echo "2. Try hybrid search on the demo data:"
echo "   python scripts/demo_hybrid_search.py \"blockchain\" --mode compare"
echo ""
echo "3. Use the Claude Desktop integration by configuring the MCP server"
echo "   in Claude's configuration file (see README.md for details)."
echo ""
echo "For more information, please refer to the documentation in README.md."
```
--------------------------------------------------------------------------------
/solr_mcp/tools/solr_semantic_select.py:
--------------------------------------------------------------------------------
```python
 1 | """Tool for executing semantic search queries against Solr collections."""
 2 | 
 3 | from typing import Dict, List, Optional
 4 | 
 5 | from solr_mcp.tools.tool_decorator import tool
 6 | from solr_mcp.vector_provider.constants import DEFAULT_OLLAMA_CONFIG
 7 | 
 8 | 
 9 | @tool()
10 | async def execute_semantic_select_query(
11 |     mcp, query: str, text: str, field: Optional[str] = None, vector_provider: str = ""
12 | ) -> Dict:
13 |     """Execute semantic search queries against Solr collections.
14 | 
15 |     Extends solr_select tool with semantic search capabilities.
16 | 
17 |     Additional Parameters:
18 |     - text: Natural language text that is converted to vector, which will be used to match against other vector fields
19 |     - field: Name of the vector field to search against (optional, will auto-detect if not specified)
20 |     - vector_provider: Vector provider specification in format "model@host:port" (e.g., "nomic-embed-text@localhost:11434")
21 |        If not specified, the default vector provider will be used
22 | 
23 |     The query results will be ranked based on semantic similarity to the provided text. Therefore, ORDER BY is not allowed.
24 | 
25 |     Collection/Field Rules:
26 |     - Vector field must be a dense_vector or knn_vector field type
27 |     - The specified field must exist in the collection schema
28 |     - The vector provider's dimensionality must match the dimensionality of the vector field
29 | 
30 |     Supported Features:
31 |     - All standard SELECT query features except ORDER BY
32 |       - Results are ordered by semantic similarity
33 |     - Hybrid search combining keyword (SQL WHERE clauses) and vector distance (text parameter)
34 | 
35 |     Args:
36 |         mcp: SolrMCPServer instance
37 |         query: SQL query to execute
38 |         text: Search text to convert to vector
39 |         field: Name of the vector field to search against (optional, auto-detected if not specified)
40 |         vector_provider: Optional vector provider specification "model@host:port"
41 | 
42 |     Returns:
43 |         Query results
44 |     """
45 |     solr_client = mcp.solr_client
46 | 
47 |     # Configure vector provider from parameter string
48 |     vector_provider_config = {}
49 | 
50 |     if vector_provider:
51 |         # Parse "model@host:port" format
52 |         model_part = vector_provider
53 |         host_port_part = None
54 | 
55 |         if "@" in vector_provider:
56 |             parts = vector_provider.split("@", 1)
57 |             model_part = parts[0]
58 |             host_port_part = parts[1]
59 | 
60 |         # Set model if specified
61 |         if model_part:
62 |             vector_provider_config["model"] = model_part
63 | 
64 |         # Set host:port if specified
65 |         if host_port_part:
66 |             if ":" in host_port_part:
67 |                 host, port_str = host_port_part.split(":", 1)
68 |                 try:
69 |                     port = int(port_str)
70 |                     vector_provider_config["base_url"] = f"http://{host}:{port}"
71 |                 except ValueError:
72 |                     # If port is not a valid integer, use the host with default port
73 |                     vector_provider_config["base_url"] = f"http://{host_port_part}"
74 |             else:
75 |                 # Only host specified, use default port
76 |                 vector_provider_config["base_url"] = f"http://{host_port_part}:11434"
77 | 
78 |     return await solr_client.execute_semantic_select_query(
79 |         query, text, field, vector_provider_config
80 |     )
81 | 
```
--------------------------------------------------------------------------------
/QUICKSTART.md:
--------------------------------------------------------------------------------
```markdown
  1 | # Solr MCP Quick Start Guide
  2 | 
  3 | This guide will help you get up and running with the Solr MCP server quickly.
  4 | 
  5 | ## Prerequisites
  6 | 
  7 | - Python 3.10 or higher
  8 | - Docker and Docker Compose
  9 | - Git
 10 | 
 11 | ## Step 1: Clone the Repository
 12 | 
 13 | ```bash
 14 | git clone https://github.com/allenday/solr-mcp.git
 15 | cd solr-mcp
 16 | ```
 17 | 
 18 | ## Step 2: Start SolrCloud with Docker
 19 | 
 20 | ```bash
 21 | docker-compose up -d
 22 | ```
 23 | 
 24 | This will start a SolrCloud instance with ZooKeeper and Ollama for embedding generation.
 25 | 
 26 | Verify that Solr is running by visiting: http://localhost:8983/solr/
 27 | 
 28 | ## Step 3: Set Up Python Environment
 29 | 
 30 | ```bash
 31 | # Create a virtual environment
 32 | python -m venv venv
 33 | 
 34 | # Activate it
 35 | source venv/bin/activate  # On Windows: venv\Scripts\activate
 36 | 
 37 | # Install Poetry
 38 | pip install poetry
 39 | 
 40 | # Install dependencies
 41 | poetry install
 42 | ```
 43 | 
 44 | ## Step 4: Process and Index Sample Documents
 45 | 
 46 | The repository includes the Bitcoin whitepaper as a sample document. Let's process and index it:
 47 | 
 48 | ```bash
 49 | # Process the Markdown file into sections
 50 | python scripts/process_markdown.py data/bitcoin-whitepaper.md --output data/processed/bitcoin_sections.json
 51 | 
 52 | # Create a unified collection
 53 | python scripts/create_unified_collection.py unified
 54 | 
 55 | # Index the sections with embeddings
 56 | python scripts/unified_index.py data/processed/bitcoin_sections.json --collection unified
 57 | ```
 58 | 
 59 | ## Step 5: Run the MCP Server
 60 | 
 61 | ```bash
 62 | poetry run python -m solr_mcp.server
 63 | ```
 64 | 
 65 | By default, the server will run on http://localhost:8000
 66 | 
 67 | ## Step 6: Test the Search Functionality
 68 | 
 69 | You can test the different search capabilities using the demo scripts:
 70 | 
 71 | ```bash
 72 | # Test keyword search
 73 | python scripts/simple_search.py "double spend" --collection unified
 74 | 
 75 | # Test vector search
 76 | python scripts/vector_search.py "how does bitcoin prevent fraud" --collection unified
 77 | 
 78 | # Test hybrid search (combining keyword and vector)
 79 | python scripts/simple_mcp_test.py
 80 | ```
 81 | 
 82 | ## Using with Claude Desktop
 83 | 
 84 | To use the MCP server with Claude Desktop:
 85 | 
 86 | 1. Make sure the MCP server is running
 87 | 2. In Claude Desktop, go to Settings > Tools
 88 | 3. Add a new tool with:
 89 |    - Name: Solr Search
 90 |    - URL: http://localhost:8000
 91 |    - Working Directory: /path/to/solr-mcp
 92 | 
 93 | Now you can ask Claude queries like:
 94 | - "Search for information about double spending in the Bitcoin whitepaper"
 95 | - "Find sections related to consensus mechanisms"
 96 | - "What does the whitepaper say about transaction verification?"
 97 | 
 98 | ## Troubleshooting
 99 | 
100 | If you encounter issues:
101 | 
102 | 1. Check that Solr is running: http://localhost:8983/solr/
103 | 2. Verify the collection exists: http://localhost:8983/solr/#/~collections
104 | 3. Run the diagnostic script: `python scripts/diagnose_search.py`
105 | 4. Check the server logs for errors
106 | 
## Setting up linting and formatting
108 | 
109 | We use several tools to maintain code quality:
110 | 
111 | ```bash
112 | # Run code formatters (black and isort)
113 | poetry run python scripts/format.py
114 | # Or use the poetry script
115 | poetry run format
116 | 
117 | # Run linters (flake8 and mypy)
118 | poetry run python scripts/lint.py
119 | # Or use the poetry script
120 | poetry run lint
121 | ```
122 | 
123 | You can also run individual tools:
124 | 
125 | ```bash
126 | # Format code with Black
127 | poetry run black solr_mcp tests
128 | 
129 | # Sort imports with isort
130 | poetry run isort solr_mcp tests
131 | 
132 | # Run flake8 linter
133 | poetry run flake8 solr_mcp tests
134 | 
135 | # Run mypy type checker
136 | poetry run mypy solr_mcp tests
137 | ```
138 | 
```
--------------------------------------------------------------------------------
/tests/unit/test_client.py:
--------------------------------------------------------------------------------
```python
 1 | """Unit tests for SolrClient."""
 2 | 
 3 | from unittest.mock import Mock, patch
 4 | 
 5 | import pytest
 6 | 
 7 | from solr_mcp.solr.client import SolrClient
 8 | from solr_mcp.solr.interfaces import CollectionProvider, VectorSearchProvider
 9 | 
10 | from .conftest import MOCK_RESPONSES, MockCollectionProvider, MockVectorProvider
11 | 
12 | 
class TestSolrClient:
    """Test cases for SolrClient."""

    def test_init_with_defaults(self, mock_config, mock_field_manager, mock_ollama):
        """Test initialization with default dependencies."""
        client = SolrClient(
            config=mock_config,
            field_manager=mock_field_manager,
            vector_provider=mock_ollama,
        )
        assert client.config == mock_config
        # No collection provider was supplied, so the client must build one.
        assert isinstance(client.collection_provider, CollectionProvider)
        assert client.field_manager == mock_field_manager
        assert client.vector_provider == mock_ollama

    def test_init_with_custom_providers(self, mock_config, mock_field_manager):
        """Test initialization with custom providers."""
        mock_collection_provider = MockCollectionProvider()
        mock_vector_provider = MockVectorProvider()
        mock_solr = Mock()  # Create a simple mock

        client = SolrClient(
            config=mock_config,
            collection_provider=mock_collection_provider,
            solr_client=mock_solr,
            field_manager=mock_field_manager,
            vector_provider=mock_vector_provider,
        )
        # Injected dependencies must be stored as-is, not replaced by defaults.
        assert client.config == mock_config
        assert client.collection_provider == mock_collection_provider
        assert client.field_manager == mock_field_manager
        assert client.vector_provider == mock_vector_provider

    @pytest.mark.asyncio
    @pytest.mark.parametrize("collection", ["collection1", "test_collection"])
    async def test_execute_select_query_success(
        self, mock_config, mock_field_manager, collection
    ):
        """Test successful SQL query execution with different collections."""
        # Create a mock for the query builder
        mock_query_builder = Mock()
        mock_query_builder.parser = Mock()
        mock_query_builder.parser.preprocess_query = Mock(
            return_value=f"SELECT * FROM {collection}"
        )
        mock_query_builder.parse_and_validate_select = Mock(
            return_value=(
                Mock(),  # AST
                collection,  # Collection name
                ["id", "title"],  # Fields
            )
        )

        # Create a mock response
        mock_response = Mock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            "result-set": {"docs": [{"id": "1", "field": "value"}], "numFound": 1}
        }

        # Create client with dependencies and patch requests.post
        # (patching intercepts the HTTP call to Solr's SQL endpoint)
        with patch("requests.post", return_value=mock_response):
            client = SolrClient(
                config=mock_config,
                field_manager=mock_field_manager,
                query_builder=mock_query_builder,
            )

            # Execute query
            result = await client.execute_select_query(f"SELECT * FROM {collection}")

            # Verify result structure
            assert "result-set" in result
            assert "docs" in result["result-set"]
            assert result["result-set"]["docs"][0]["id"] == "1"
```
--------------------------------------------------------------------------------
/solr_mcp/solr/zookeeper.py:
--------------------------------------------------------------------------------
```python
  1 | """ZooKeeper-based collection provider."""
  2 | 
  3 | from typing import List
  4 | 
  5 | import anyio
  6 | from kazoo.client import KazooClient
  7 | from kazoo.exceptions import ConnectionLoss, NoNodeError
  8 | 
  9 | from solr_mcp.solr.exceptions import ConnectionError
 10 | from solr_mcp.solr.interfaces import CollectionProvider
 11 | 
 12 | 
 13 | class ZooKeeperCollectionProvider(CollectionProvider):
 14 |     """Collection provider that uses ZooKeeper to discover collections."""
 15 | 
    def __init__(self, hosts: List[str]):
        """Initialize with ZooKeeper hosts.

        Args:
            hosts: List of ZooKeeper hosts in format host:port

        Note:
            Connects eagerly: construction raises ConnectionError (via
            connect()) if ZooKeeper is unreachable or /collections is missing.
        """
        self.hosts = hosts
        self.zk = None  # KazooClient once connected; reset to None by cleanup()
        self.connect()
 25 | 
 26 |     def connect(self):
 27 |         """Connect to ZooKeeper and verify /collections path exists."""
 28 |         try:
 29 |             self.zk = KazooClient(hosts=",".join(self.hosts))
 30 |             self.zk.start()
 31 | 
 32 |             # Check if /collections path exists
 33 |             if not self.zk.exists("/collections"):
 34 |                 raise ConnectionError("ZooKeeper /collections path does not exist")
 35 | 
 36 |         except ConnectionLoss as e:
 37 |             raise ConnectionError(f"Failed to connect to ZooKeeper: {str(e)}")
 38 |         except Exception as e:
 39 |             raise ConnectionError(f"Error connecting to ZooKeeper: {str(e)}")
 40 | 
 41 |     def cleanup(self):
 42 |         """Clean up ZooKeeper connection."""
 43 |         if self.zk:
 44 |             try:
 45 |                 self.zk.stop()
 46 |                 self.zk.close()
 47 |             except Exception:
 48 |                 pass  # Ignore cleanup errors
 49 |             finally:
 50 |                 self.zk = None
 51 | 
 52 |     async def list_collections(self) -> List[str]:
 53 |         """List available collections from ZooKeeper.
 54 | 
 55 |         Returns:
 56 |             List of collection names
 57 | 
 58 |         Raises:
 59 |             ConnectionError: If there is an error communicating with ZooKeeper
 60 |         """
 61 |         try:
 62 |             if not self.zk:
 63 |                 raise ConnectionError("Not connected to ZooKeeper")
 64 | 
 65 |             collections = await anyio.to_thread.run_sync(
 66 |                 self.zk.get_children, "/collections"
 67 |             )
 68 |             return collections
 69 | 
 70 |         except NoNodeError:
 71 |             return []  # No collections exist yet
 72 |         except ConnectionLoss as e:
 73 |             raise ConnectionError(f"Lost connection to ZooKeeper: {str(e)}")
 74 |         except Exception as e:
 75 |             raise ConnectionError(f"Error listing collections: {str(e)}")
 76 | 
 77 |     async def collection_exists(self, collection: str) -> bool:
 78 |         """Check if a collection exists in ZooKeeper.
 79 | 
 80 |         Args:
 81 |             collection: Name of the collection to check
 82 | 
 83 |         Returns:
 84 |             True if the collection exists, False otherwise
 85 | 
 86 |         Raises:
 87 |             ConnectionError: If there is an error communicating with ZooKeeper
 88 |         """
 89 |         try:
 90 |             if not self.zk:
 91 |                 raise ConnectionError("Not connected to ZooKeeper")
 92 | 
 93 |             # Check for collection in ZooKeeper
 94 |             collection_path = f"/collections/{collection}"
 95 |             exists = await anyio.to_thread.run_sync(self.zk.exists, collection_path)
 96 |             return exists is not None
 97 | 
 98 |         except ConnectionLoss as e:
 99 |             raise ConnectionError(f"Lost connection to ZooKeeper: {str(e)}")
100 |         except Exception as e:
101 |             raise ConnectionError(f"Error checking collection existence: {str(e)}")
102 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/exceptions.py:
--------------------------------------------------------------------------------
```python
  1 | """Solr client exceptions."""
  2 | 
  3 | from typing import Any, Dict, Optional
  4 | 
  5 | 
  6 | class SolrError(Exception):
  7 |     """Base exception for Solr-related errors."""
  8 | 
  9 |     pass
 10 | 
 11 | 
 12 | class ConfigurationError(SolrError):
 13 |     """Configuration-related errors."""
 14 | 
 15 |     pass
 16 | 
 17 | 
 18 | class ConnectionError(SolrError):
 19 |     """Exception raised for connection-related errors."""
 20 | 
 21 |     pass
 22 | 
 23 | 
 24 | class QueryError(SolrError):
 25 |     """Base exception for query-related errors."""
 26 | 
 27 |     def __init__(
 28 |         self,
 29 |         message: str,
 30 |         error_type: Optional[str] = None,
 31 |         response_time: Optional[int] = None,
 32 |     ):
 33 |         self.message = message
 34 |         self.error_type = error_type
 35 |         self.response_time = response_time
 36 |         super().__init__(self.message)
 37 | 
 38 |     def to_dict(self) -> Dict[str, Any]:
 39 |         """Convert the error to a dictionary format."""
 40 |         return {
 41 |             "error_type": self.error_type,
 42 |             "message": self.message,
 43 |             "response_time": self.response_time,
 44 |         }
 45 | 
 46 | 
 47 | class DocValuesError(QueryError):
 48 |     """Exception raised when a query requires DocValues but fields don't have them enabled."""
 49 | 
 50 |     def __init__(self, message: str, response_time: Optional[int] = None):
 51 |         super().__init__(
 52 |             message, error_type="MISSING_DOCVALUES", response_time=response_time
 53 |         )
 54 | 
 55 | 
 56 | class SQLParseError(QueryError):
 57 |     """Exception raised when SQL query parsing fails."""
 58 | 
 59 |     def __init__(self, message: str, response_time: Optional[int] = None):
 60 |         super().__init__(message, error_type="PARSE_ERROR", response_time=response_time)
 61 | 
 62 | 
 63 | class SQLExecutionError(QueryError):
 64 |     """Exception raised for other SQL execution errors."""
 65 | 
 66 |     def __init__(self, message: str, response_time: Optional[int] = None):
 67 |         super().__init__(
 68 |             message, error_type="SOLR_SQL_ERROR", response_time=response_time
 69 |         )
 70 | 
 71 | 
 72 | class SchemaError(SolrError):
 73 |     """Base exception for schema-related errors."""
 74 | 
 75 |     def __init__(
 76 |         self,
 77 |         message: str,
 78 |         error_type: str = "schema_error",
 79 |         collection: str = "unknown",
 80 |     ):
 81 |         """Initialize SchemaError.
 82 | 
 83 |         Args:
 84 |             message: Error message
 85 |             error_type: Type of schema error
 86 |             collection: Collection name
 87 |         """
 88 |         self.error_type = error_type
 89 |         self.collection = collection
 90 |         super().__init__(message)
 91 | 
 92 |     def to_dict(self) -> Dict[str, Any]:
 93 |         """Convert the error to a dictionary format."""
 94 |         return {
 95 |             "error_type": self.error_type,
 96 |             "message": self.message,
 97 |             "collection": self.collection,
 98 |         }
 99 | 
100 | 
class CollectionNotFoundError(SchemaError):
    """Exception raised when a collection does not exist."""

    def __init__(self, collection: str):
        """Build the error from the missing collection's name."""
        super().__init__(
            message=f"Collection '{collection}' not found",
            error_type="COLLECTION_NOT_FOUND",
            collection=collection,
        )
110 | 
111 | 
class SchemaNotFoundError(SchemaError):
    """Exception raised when a collection's schema cannot be retrieved."""

    # Fix: `details` defaults to None, so its annotation must be
    # Optional[str] rather than the implicit-Optional `str = None`.
    def __init__(self, collection: str, details: Optional[str] = None):
        """Initialize with the collection name and optional failure details.

        Args:
            collection: Name of the collection whose schema lookup failed
            details: Optional extra context appended to the message
        """
        message = f"Schema for collection '{collection}' could not be retrieved"
        if details:
            message += f": {details}"
        super().__init__(
            message=message, error_type="SCHEMA_NOT_FOUND", collection=collection
        )
122 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/zookeeper_fixtures.py:
--------------------------------------------------------------------------------
```python
 1 | """ZooKeeper fixtures for unit tests."""
 2 | 
 3 | from unittest.mock import MagicMock, patch
 4 | 
 5 | import pytest
 6 | from kazoo.client import KazooClient
 7 | from kazoo.exceptions import ConnectionLoss, NoNodeError
 8 | 
 9 | 
@pytest.fixture(
    params=["success", "no_collections", "empty", "error", "connection_error"]
)
def mock_kazoo_client(request):
    """Parameterized KazooClient mock with different behavior scenarios."""
    mock = MagicMock(spec=KazooClient)
    scenario = request.param

    # Shared lifecycle defaults; "connection_error" replaces start below.
    mock.stop.return_value = None
    if scenario != "connection_error":
        mock.start.return_value = None

    if scenario == "success":
        mock.exists.return_value = True
        mock.get_children.return_value = ["collection1", "collection2"]
    elif scenario == "no_collections":
        mock.exists.return_value = False
        mock.get_children.side_effect = NoNodeError
    elif scenario == "empty":
        mock.exists.return_value = True
        mock.get_children.return_value = []
    elif scenario == "error":
        mock.exists.return_value = True
        mock.get_children.side_effect = ConnectionLoss("ZooKeeper error")
    elif scenario == "connection_error":
        mock.start.side_effect = ConnectionLoss("ZooKeeper connection error")

    yield mock, scenario
44 | 
45 | 
@pytest.fixture
def mock_kazoo_client_factory(request):
    """Factory for creating KazooClient mocks with specific behavior."""

    def _create_client(scenario="success"):
        client = MagicMock(spec=KazooClient)

        # Shared lifecycle defaults; "connection_error" replaces start below.
        client.stop.return_value = None
        if scenario != "connection_error":
            client.start.return_value = None

        if scenario == "success":
            client.exists.return_value = True
            client.get_children.return_value = ["collection1", "collection2"]
        elif scenario == "no_collections":
            client.exists.return_value = False
            client.get_children.side_effect = NoNodeError
        elif scenario == "empty":
            client.exists.return_value = True
            client.get_children.return_value = []
        elif scenario == "error":
            client.exists.return_value = True
            client.get_children.side_effect = ConnectionLoss("ZooKeeper error")
        elif scenario == "connection_error":
            client.start.side_effect = ConnectionLoss("ZooKeeper connection error")

        return client

    # Support indirect parametrization; default to the "success" scenario.
    default_scenario = getattr(request, "param", "success")
    default_client = _create_client(default_scenario)

    with patch("solr_mcp.solr.zookeeper.KazooClient", return_value=default_client):
        yield _create_client
84 | 
85 | 
@pytest.fixture
def provider(mock_kazoo_client_factory):
    """Create ZooKeeperCollectionProvider instance with mocked dependencies."""
    from solr_mcp.solr.zookeeper import ZooKeeperCollectionProvider

    # KazooClient is already patched by the factory fixture, so the
    # provider connects to the mock client on construction.
    return ZooKeeperCollectionProvider(hosts=["localhost:2181"])
94 | 
```
--------------------------------------------------------------------------------
/scripts/check_solr.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Script to check Solr configuration and status.
 4 | """
 5 | 
 6 | import asyncio
 7 | import httpx
 8 | import json
 9 | import sys
10 | 
11 | 
async def check_solr_collections():
    """Check Solr collections and their configuration."""
    try:
        async with httpx.AsyncClient() as client:
            # Ask Solr for the list of collections
            response = await client.get(
                "http://localhost:8983/solr/admin/collections",
                params={"action": "LIST", "wt": "json"},
                timeout=10.0,
            )

            if response.status_code != 200:
                print(f"Error getting collections: {response.status_code} - {response.text}")
                return

            collections_data = response.json()
            if 'collections' not in collections_data:
                print("No collections found or invalid response format")
                return

            collections = collections_data['collections']
            print(f"Found {len(collections)} collections: {', '.join(collections)}")

            # Inspect every collection's schema in turn
            for collection in collections:
                await _report_collection(client, collection)

    except Exception as e:
        print(f"Error checking Solr: {e}")


async def _report_collection(client, collection):
    """Fetch one collection's schema and print its vector configuration."""
    schema_response = await client.get(
        f"http://localhost:8983/solr/{collection}/schema",
        params={"wt": "json"},
        timeout=10.0,
    )

    if schema_response.status_code != 200:
        print(f"Error getting schema for {collection}: {schema_response.status_code}")
        return

    schema = schema_response.json().get('schema', {})

    # Look for a dense-vector field type in the schema
    vector_type = next(
        (
            ft
            for ft in schema.get('fieldTypes', [])
            if ft.get('class') == 'solr.DenseVectorField'
        ),
        None,
    )

    if vector_type:
        print(f"\nCollection '{collection}' has vector field type:")
        print(f"  Name: {vector_type.get('name')}")
        print(f"  Class: {vector_type.get('class')}")
        print(f"  Vector Dimension: {vector_type.get('vectorDimension')}")
        print(f"  Similarity Function: {vector_type.get('similarityFunction')}")
    else:
        print(f"\nCollection '{collection}' does not have a vector field type")

    # Look for fields declared with the knn_vector type
    vector_fields = [
        f for f in schema.get('fields', []) if f.get('type') == 'knn_vector'
    ]

    if vector_fields:
        print(f"\n  Vector fields in '{collection}':")
        for field in vector_fields:
            print(f"    - {field.get('name')} (indexed: {field.get('indexed')}, stored: {field.get('stored')})")
    else:
        print(f"\n  No vector fields found in '{collection}'")
80 | 
81 | 
# Script entry point: run the async collection check against local Solr.
if __name__ == "__main__":
    asyncio.run(check_solr_collections())
```
--------------------------------------------------------------------------------
/tests/unit/test_parser.py:
--------------------------------------------------------------------------------
```python
 1 | """Unit tests for QueryParser."""
 2 | 
 3 | import pytest
 4 | 
 5 | from solr_mcp.solr.exceptions import QueryError
 6 | from solr_mcp.solr.query.parser import QueryParser
 7 | 
 8 | 
 9 | @pytest.fixture
10 | def query_parser():
11 |     """Create QueryParser instance for testing."""
12 |     return QueryParser()
13 | 
14 | 
class TestQueryParser:
    """Test cases for QueryParser."""

    def test_init(self, query_parser):
        """QueryParser constructs successfully."""
        assert isinstance(query_parser, QueryParser)

    def test_preprocess_query_basic(self, query_parser):
        """Basic field:value syntax is rewritten to SQL equality."""
        sql = query_parser.preprocess_query(
            "SELECT * FROM collection1 WHERE field:value"
        )
        assert "field = 'value'" in sql

    def test_preprocess_query_multiple(self, query_parser):
        """Every field:value pair in the query is rewritten."""
        sql = query_parser.preprocess_query(
            "SELECT * FROM collection1 WHERE field1:value1 AND field2:value2"
        )
        assert "field1 = 'value1'" in sql
        assert "field2 = 'value2'" in sql

    def test_parse_select_valid(self, query_parser):
        """A valid SELECT yields its AST, collection, and field list."""
        ast, collection, fields = query_parser.parse_select(
            "SELECT id, title FROM collection1"
        )
        assert ast is not None
        assert collection == "collection1"
        assert fields == ["id", "title"]

    def test_parse_select_no_select(self, query_parser):
        """Non-SELECT statements are rejected."""
        with pytest.raises(QueryError):
            query_parser.parse_select("INSERT INTO collection1 (id) VALUES (1)")

    def test_parse_select_no_from(self, query_parser):
        """A SELECT without a FROM clause is rejected."""
        with pytest.raises(QueryError):
            query_parser.parse_select("SELECT id, title")

    def test_parse_select_with_alias(self, query_parser):
        """Aliased fields are reported under their alias names."""
        ast, collection, fields = query_parser.parse_select(
            "SELECT id as doc_id, title as doc_title FROM collection1"
        )
        assert ast is not None
        assert collection == "collection1"
        assert "doc_id" in fields
        assert "doc_title" in fields

    def test_parse_select_with_star(self, query_parser):
        """The * selector is preserved in the field list."""
        ast, collection, fields = query_parser.parse_select(
            "SELECT * FROM collection1"
        )
        assert ast is not None
        assert collection == "collection1"
        assert "*" in fields

    def test_parse_select_invalid_syntax(self, query_parser):
        """Unparseable SQL is rejected."""
        with pytest.raises(QueryError):
            query_parser.parse_select("INVALID SQL")

    def test_extract_sort_fields_single(self, query_parser):
        """A single sort spec yields a one-element field list."""
        assert query_parser.extract_sort_fields("title desc") == ["title"]

    def test_extract_sort_fields_multiple(self, query_parser):
        """Comma-separated sort specs yield all fields in order."""
        assert query_parser.extract_sort_fields("title desc, id asc") == [
            "title",
            "id",
        ]
```
--------------------------------------------------------------------------------
/scripts/prepare_data.py:
--------------------------------------------------------------------------------
```python
 1 | #!/usr/bin/env python3
 2 | """
 3 | Script to prepare data for indexing in Solr with dynamic field naming conventions.
 4 | """
 5 | 
 6 | import argparse
 7 | import json
 8 | import sys
 9 | import os
10 | from datetime import datetime
11 | 
# Fields passed through to Solr without a dynamic-field suffix.
_PASSTHROUGH_KEYS = {'id', 'title', 'text', 'source'}


def _normalize_datetime(value):
    """Normalize a datetime string to Solr's YYYY-MM-DDThh:mm:ssZ format.

    Truncates fractional seconds and guarantees a trailing 'Z'. Non-string
    values are returned unchanged.
    """
    if isinstance(value, str):
        if '.' in value:
            # Truncate microseconds if present
            value = value.split('.')[0] + 'Z'
        elif not value.endswith('Z'):
            value = value + 'Z'
    return value


def _normalize_date(value):
    """Normalize a date string to a full Solr timestamp.

    A bare YYYY-MM-DD date gains a midnight time component; a timestamp
    without a trailing 'Z' gains one. Non-string values pass through.
    """
    if isinstance(value, str):
        if len(value) == 10 and value.count('-') == 2:
            value = value + 'T00:00:00Z'
        elif 'T' in value and not value.endswith('Z'):
            value = value + 'Z'
    return value


def _transform_doc(doc):
    """Map one document's keys onto Solr dynamic-field names."""
    transformed = {}
    for key, value in doc.items():
        if key in _PASSTHROUGH_KEYS:
            transformed[key] = value
        elif key == 'section_number':
            transformed['section_number_i'] = value  # integer -> _i
        elif key == 'date_indexed':
            transformed[f'{key}_dt'] = _normalize_datetime(value)  # date -> _dt
        elif key == 'date':
            transformed[f'{key}_dt'] = _normalize_date(value)  # date -> _dt
        elif key in ('tags', 'category'):
            transformed[f'{key}_ss'] = value  # multi-valued string -> _ss
        elif key == 'author':
            transformed[f'{key}_s'] = value  # single string -> _s
        else:
            # Default: keep as is
            transformed[key] = value
    return transformed


def prepare_data_for_solr(input_file, output_file):
    """
    Modify field names to use Solr dynamic field naming conventions.

    Args:
        input_file: Path to the input JSON file
        output_file: Path to the output JSON file
    """
    # Load the input data
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Transform the data
    transformed_data = [_transform_doc(doc) for doc in data]

    # Robustness fix: ensure the output directory exists — the default
    # data/processed/ path fails with FileNotFoundError otherwise.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write the transformed data to output file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(transformed_data, f, indent=2)

    print(f"Prepared {len(transformed_data)} documents for Solr indexing")
    print(f"Output saved to {output_file}")
77 | 
# Script entry point: parse CLI arguments and run the transformation.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Prepare data for Solr indexing")
    parser.add_argument("input_file", help="Path to the input JSON file")
    parser.add_argument(
        "--output", "-o", default=None, help="Path to the output JSON file"
    )

    args = parser.parse_args()

    # Generate output filename if not provided
    if args.output is None:
        input_name = os.path.basename(args.input_file)
        name, ext = os.path.splitext(input_name)
        args.output = f"data/processed/{name}_solr{ext}"

    # Robustness fix: create the output directory before writing — the
    # default data/processed/ directory may not exist yet.
    out_dir = os.path.dirname(args.output)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    prepare_data_for_solr(args.input_file, args.output)
```
--------------------------------------------------------------------------------
/tests/unit/tools/test_solr_default_vectorizer.py:
--------------------------------------------------------------------------------
```python
 1 | """Tests for solr_default_vectorizer tool."""
 2 | 
 3 | from unittest.mock import AsyncMock, MagicMock, patch
 4 | 
 5 | import pytest
 6 | 
 7 | from solr_mcp.tools.solr_default_vectorizer import get_default_text_vectorizer
 8 | from solr_mcp.vector_provider.constants import DEFAULT_OLLAMA_CONFIG, MODEL_DIMENSIONS
 9 | 
10 | 
class TestDefaultVectorizerTool:
    """Test cases for default_text_vectorizer tool."""

    @pytest.mark.asyncio
    async def test_get_default_text_vectorizer_with_server(self):
        """Test getting default vectorizer with a server instance."""
        # Create mock server: server.solr_client.vector_manager.client holds
        # the configured model name and the provider's base URL.
        mock_vector_manager = MagicMock()
        mock_vector_manager.client.model = "nomic-embed-text"
        mock_vector_manager.client.base_url = "http://test-host:8888"

        mock_solr_client = MagicMock()
        mock_solr_client.vector_manager = mock_vector_manager

        mock_server = MagicMock()
        mock_server.solr_client = mock_solr_client

        # Execute tool
        result = await get_default_text_vectorizer(mock_server)

        # Verify result: model/host/port come from the mocked client; the
        # 768 dimension is the mapping for "nomic-embed-text" in
        # MODEL_DIMENSIONS (see the unknown-model test for the fallback).
        assert result["vector_provider_model"] == "nomic-embed-text"
        assert result["vector_provider_dimension"] == 768
        assert result["vector_provider_host"] == "test-host"
        assert result["vector_provider_port"] == 8888
        assert result["vector_provider_url"] == "http://test-host:8888"
        assert result["vector_provider_spec"] == "nomic-embed-text@test-host:8888"

    @pytest.mark.asyncio
    async def test_get_default_text_vectorizer_without_server(self):
        """Test getting default vectorizer without a server instance."""
        # Create a server without vector_manager: spec=[...] restricts the
        # mock's attributes, so accessing solr_client raises AttributeError
        # and the tool must fall back to its defaults.
        mock_server = MagicMock(spec=["no_solr_client"])

        # Use patch to avoid trying to parse MagicMock as URL
        # NOTE(review): this patches the attribute on the constants module;
        # if the tool imported DEFAULT_OLLAMA_CONFIG by value at import
        # time, the patch has no effect on the tool — confirm intended.
        with patch(
            "solr_mcp.vector_provider.constants.DEFAULT_OLLAMA_CONFIG",
            {
                "model": "nomic-embed-text",
                "base_url": "http://localhost:11434",
                "timeout": 30,
                "retries": 3,
            },
        ):
            # Execute tool
            result = await get_default_text_vectorizer(mock_server)

        # Verify result uses defaults (compared against the imported,
        # unpatched DEFAULT_OLLAMA_CONFIG and MODEL_DIMENSIONS constants).
        assert result["vector_provider_model"] == DEFAULT_OLLAMA_CONFIG["model"]
        assert (
            result["vector_provider_dimension"]
            == MODEL_DIMENSIONS[DEFAULT_OLLAMA_CONFIG["model"]]
        )
        assert result["vector_provider_host"] == "localhost"
        assert result["vector_provider_port"] == 11434
        assert result["vector_provider_url"] == DEFAULT_OLLAMA_CONFIG["base_url"]
        assert (
            result["vector_provider_spec"]
            == f"{DEFAULT_OLLAMA_CONFIG['model']}@localhost:11434"
        )

    @pytest.mark.asyncio
    async def test_get_default_text_vectorizer_unknown_model(self):
        """Test getting default vectorizer with unknown model."""
        # Create mock server whose client reports a model that is not in
        # MODEL_DIMENSIONS, to exercise the dimension fallback.
        mock_vector_manager = MagicMock()
        mock_vector_manager.client.model = "unknown-model"
        mock_vector_manager.client.base_url = "http://test-host:8888"

        mock_solr_client = MagicMock()
        mock_solr_client.vector_manager = mock_vector_manager

        mock_server = MagicMock()
        mock_server.solr_client = mock_solr_client

        # Execute tool
        result = await get_default_text_vectorizer(mock_server)

        # Verify result with default dimension for unknown model
        assert result["vector_provider_model"] == "unknown-model"
        assert result["vector_provider_dimension"] == 768  # Default dimension
        assert result["vector_provider_spec"] == "unknown-model@test-host:8888"
```
--------------------------------------------------------------------------------
/solr_mcp/solr/query/validator.py:
--------------------------------------------------------------------------------
```python
  1 | """Query validation for SolrCloud client."""
  2 | 
  3 | import logging
  4 | from typing import Any, Dict, List, Optional
  5 | 
  6 | from loguru import logger
  7 | from sqlglot.expressions import Select
  8 | 
  9 | from solr_mcp.solr.exceptions import QueryError
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class QueryValidator:
 15 |     """Validates SQL queries for Solr."""
 16 | 
 17 |     def __init__(self, field_manager):
 18 |         """Initialize the QueryValidator.
 19 | 
 20 |         Args:
 21 |             field_manager: FieldManager instance for field validation
 22 |         """
 23 |         self.field_manager = field_manager
 24 | 
 25 |     def validate_fields(self, collection: str, fields: List[str]) -> None:
 26 |         """Validate that fields exist in the collection.
 27 | 
 28 |         Args:
 29 |             collection: Collection name
 30 |             fields: List of field names to validate
 31 | 
 32 |         Raises:
 33 |             QueryError: If fields are invalid
 34 |         """
 35 |         try:
 36 |             # Get available fields for collection
 37 |             available_fields = self.field_manager.get_field_types(collection)
 38 | 
 39 |             # Check each field exists
 40 |             for field in fields:
 41 |                 if field not in available_fields:
 42 |                     raise QueryError(
 43 |                         f"Invalid field '{field}' - field does not exist in collection '{collection}'"
 44 |                     )
 45 | 
 46 |         except QueryError:
 47 |             raise
 48 |         except Exception as e:
 49 |             raise QueryError(f"Field validation error: {str(e)}")
 50 | 
 51 |     def validate_sort_fields(self, collection: str, fields: List[str]) -> None:
 52 |         """Validate that fields are sortable in the collection.
 53 | 
 54 |         Args:
 55 |             collection: Collection name
 56 |             fields: List of field names to validate
 57 | 
 58 |         Raises:
 59 |             QueryError: If fields are not sortable
 60 |         """
 61 |         try:
 62 |             self.field_manager.validate_sort_fields(collection, fields)
 63 |         except Exception as e:
 64 |             raise QueryError(f"Sort field validation error: {str(e)}")
 65 | 
 66 |     def validate_sort(self, sort: Optional[str], collection: str) -> Optional[str]:
 67 |         """Validate and normalize sort parameter.
 68 | 
 69 |         Args:
 70 |             sort: Sort string in format "field direction" or just "field"
 71 |             collection: Collection name
 72 | 
 73 |         Returns:
 74 |             Validated sort string or None if sort is None
 75 | 
 76 |         Raises:
 77 |             QueryError: If sort specification is invalid
 78 |         """
 79 |         if not sort:
 80 |             return None
 81 | 
 82 |         parts = sort.strip().split()
 83 |         if len(parts) == 1:
 84 |             field = parts[0]
 85 |             direction = None
 86 |         elif len(parts) == 2:
 87 |             field, direction = parts
 88 |         else:
 89 |             raise QueryError(f"Invalid sort format: {sort}")
 90 | 
 91 |         try:
 92 |             # Get sortable fields for the collection
 93 |             field_info = self.field_manager.get_field_info(collection)
 94 |             sortable_fields = field_info["sortable_fields"]
 95 | 
 96 |             # Check if field is sortable
 97 |             if field not in sortable_fields:
 98 |                 raise QueryError(f"Field '{field}' is not sortable")
 99 | 
100 |             # Validate direction if provided
101 |             if direction:
102 |                 valid_directions = sortable_fields[field]["directions"]
103 |                 if direction.lower() not in [d.lower() for d in valid_directions]:
104 |                     raise QueryError(
105 |                         f"Invalid sort direction '{direction}' for field '{field}'"
106 |                     )
107 |             else:
108 |                 # Use default direction for field
109 |                 direction = sortable_fields[field]["default_direction"]
110 | 
111 |             return f"{field} {direction}"
112 |         except QueryError:
113 |             raise
114 |         except Exception as e:
115 |             raise QueryError(f"Sort field validation error: {str(e)}")
116 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/utils/formatting.py:
--------------------------------------------------------------------------------
```python
  1 | """Utilities for formatting Solr search results."""
  2 | 
  3 | import json
  4 | import logging
  5 | from typing import Any, Dict, List, Optional, Union
  6 | 
  7 | import pysolr
  8 | 
  9 | from solr_mcp.solr.exceptions import QueryError, SolrError
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | def format_search_results(
 15 |     results: pysolr.Results,
 16 |     start: int = 0,
 17 |     include_score: bool = True,
 18 |     include_facets: bool = True,
 19 |     include_highlighting: bool = True,
 20 | ) -> str:
 21 |     """Format Solr search results for consumption.
 22 | 
 23 |     Args:
 24 |         results: pysolr Results object
 25 |         start: Start offset used in the search
 26 |         include_score: Whether to include score information
 27 |         include_facets: Whether to include facet information
 28 |         include_highlighting: Whether to include highlighting information
 29 | 
 30 |     Returns:
 31 |         Formatted results as JSON string
 32 |     """
 33 |     try:
 34 |         formatted = {
 35 |             "result-set": {
 36 |                 "numFound": results.hits,
 37 |                 "start": start,
 38 |                 "docs": list(results.docs) if hasattr(results, "docs") else [],
 39 |             }
 40 |         }
 41 | 
 42 |         # Include score information if requested and available
 43 |         if include_score and hasattr(results, "max_score"):
 44 |             formatted["result-set"]["maxScore"] = results.max_score
 45 | 
 46 |         # Include facets if requested and available
 47 |         if include_facets and hasattr(results, "facets") and results.facets:
 48 |             formatted["result-set"]["facets"] = results.facets
 49 | 
 50 |         # Include highlighting if requested and available
 51 |         if (
 52 |             include_highlighting
 53 |             and hasattr(results, "highlighting")
 54 |             and results.highlighting
 55 |         ):
 56 |             formatted["result-set"]["highlighting"] = results.highlighting
 57 | 
 58 |         try:
 59 |             return json.dumps(formatted, default=str)
 60 |         except TypeError as e:
 61 |             logger.error(f"JSON serialization error: {e}")
 62 |             # Fall back to basic result format
 63 |             return json.dumps(
 64 |                 {
 65 |                     "result-set": {
 66 |                         "numFound": results.hits,
 67 |                         "start": start,
 68 |                         "docs": (
 69 |                             [str(doc) for doc in results.docs]
 70 |                             if hasattr(results, "docs")
 71 |                             else []
 72 |                         ),
 73 |                     }
 74 |                 }
 75 |             )
 76 |     except Exception as e:
 77 |         logger.error(f"Error formatting search results: {e}")
 78 |         return json.dumps({"error": str(e)})
 79 | 
 80 | 
 81 | def format_sql_response(raw_response: Dict[str, Any]) -> Dict[str, Any]:
 82 |     """Format SQL query response to a standardized structure."""
 83 |     try:
 84 |         # Check for error response
 85 |         if "result-set" in raw_response and "docs" in raw_response["result-set"]:
 86 |             docs = raw_response["result-set"]["docs"]
 87 |             if len(docs) == 1 and "EXCEPTION" in docs[0]:
 88 |                 raise QueryError(docs[0]["EXCEPTION"])
 89 | 
 90 |         # Return standardized response format
 91 |         return {
 92 |             "result-set": {
 93 |                 "docs": raw_response.get("result-set", {}).get("docs", []),
 94 |                 "numFound": len(raw_response.get("result-set", {}).get("docs", [])),
 95 |                 "start": 0,
 96 |             }
 97 |         }
 98 |     except QueryError as e:
 99 |         raise e
100 |     except Exception as e:
101 |         raise QueryError(f"Error formatting SQL response: {str(e)}")
102 | 
103 | 
def format_error_response(error: Exception) -> str:
    """Serialize an exception into the standard JSON error envelope.

    Args:
        error: The exception to report.

    Returns:
        JSON string of the form {"error": {"code": ..., "message": ...}}.
    """
    # QueryError is checked before SolrError so the more specific code wins;
    # anything else is reported as an internal error.
    if isinstance(error, QueryError):
        code = "QUERY_ERROR"
    elif isinstance(error, SolrError):
        code = "SOLR_ERROR"
    else:
        code = "INTERNAL_ERROR"

    return json.dumps({"error": {"code": code, "message": str(error)}})
120 | 
```
--------------------------------------------------------------------------------
/tests/unit/fixtures/common.py:
--------------------------------------------------------------------------------
```python
  1 | """Common fixtures and mock data for unit tests."""
  2 | 
  3 | from typing import List, Optional
  4 | from unittest.mock import Mock
  5 | 
  6 | import pytest
  7 | 
  8 | from solr_mcp.solr.interfaces import CollectionProvider, VectorSearchProvider
  9 | 
# Mock response data with various levels of detail.
# Keys mirror the shapes returned by the real Solr client so tests can
# patch these in directly.
MOCK_RESPONSES = {
    # Collection names served by MockCollectionProvider by default.
    "collections": ["collection1", "collection2"],
    # Canned result for a plain select/SQL query.
    "select": {"result-set": {"docs": [{"id": "1", "field": "value"}], "numFound": 1}},
    # Canned result for a vector (KNN) query; includes a similarity score.
    "vector": {
        "result-set": {
            "docs": [{"id": "1", "field": "value", "score": 0.95}],
            "numFound": 1,
        }
    },
    # Canned result for a semantic (text-to-vector) query.
    "semantic": {
        "result-set": {
            "docs": [{"id": "1", "field": "value", "score": 0.85}],
            "numFound": 1,
        }
    },
    # Minimal Solr schema API response: fields plus their field types,
    # including a 768-dimension dense vector field.
    "schema": {
        "schema": {
            "fields": [
                {
                    "name": "id",
                    "type": "string",
                    "multiValued": False,
                    "required": True,
                },
                {"name": "title", "type": "text_general", "multiValued": False},
                {"name": "content", "type": "text_general", "multiValued": False},
                {"name": "vector", "type": "knn_vector", "multiValued": False},
            ],
            "fieldTypes": [
                {"name": "string", "class": "solr.StrField", "sortMissingLast": True},
                {
                    "name": "text_general",
                    "class": "solr.TextField",
                    "positionIncrementGap": "100",
                },
                {
                    "name": "knn_vector",
                    "class": "solr.DenseVectorField",
                    "vectorDimension": 768,
                },
            ],
        }
    },
    # Field-listing response: per-field flags plus the _text_ copy-field.
    "field_list": {
        "fields": [
            {
                "name": "id",
                "type": "string",
                "indexed": True,
                "stored": True,
                "docValues": True,
                "multiValued": False,
            },
            {
                "name": "_text_",
                "type": "text_general",
                "indexed": True,
                "stored": False,
                "docValues": False,
                "multiValued": True,
                "copies_from": ["title", "content"],
            },
        ]
    },
}
 76 | 
 77 | 
 78 | class MockCollectionProvider(CollectionProvider):
 79 |     """Mock implementation of CollectionProvider."""
 80 | 
 81 |     def __init__(self, collections=None):
 82 |         """Initialize with optional list of collections."""
 83 |         self.collections = (
 84 |             collections if collections is not None else MOCK_RESPONSES["collections"]
 85 |         )
 86 | 
 87 |     async def list_collections(self) -> List[str]:
 88 |         """Return mock list of collections."""
 89 |         return self.collections
 90 | 
 91 |     async def collection_exists(self, collection: str) -> bool:
 92 |         """Check if collection exists in mock list."""
 93 |         return collection in self.collections
 94 | 
 95 | 
 96 | class MockVectorProvider(VectorSearchProvider):
 97 |     """Mock vector provider for testing."""
 98 | 
 99 |     async def execute_vector_search(self, client, vector, top_k=10):
100 |         """Mock vector search execution."""
101 |         return {
102 |             "response": {
103 |                 "docs": [
104 |                     {"_docid_": "1", "score": 0.9, "_vector_distance_": 0.1},
105 |                     {"_docid_": "2", "score": 0.8, "_vector_distance_": 0.2},
106 |                     {"_docid_": "3", "score": 0.7, "_vector_distance_": 0.3},
107 |                 ],
108 |                 "numFound": 3,
109 |                 "start": 0,
110 |             }
111 |         }
112 | 
113 |     async def get_vector(self, text: str, model: Optional[str] = None) -> List[float]:
114 |         """Mock text to vector conversion."""
115 |         return [0.1, 0.2, 0.3]
116 | 
117 | 
@pytest.fixture
def valid_config_dict():
    """Provide a known-good configuration dictionary for config tests."""
    config = {
        "solr_base_url": "http://localhost:8983/solr",
        "zookeeper_hosts": ["localhost:2181"],
        "connection_timeout": 10,
    }
    return config
126 | 
```
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
```yaml
# Local development stack: ZooKeeper + two SolrCloud nodes, a one-shot
# collection initializer, Ollama for embeddings, and the MCP server.
version: '3.8'

services:
  # Single ZooKeeper node coordinating the SolrCloud cluster.
  zookeeper:
    image: zookeeper:3.9
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      ZOO_MY_ID: 1
      ZOO_SERVERS: server.1=zookeeper:2888:3888;2181
    volumes:
      - zookeeper_data:/data
      - zookeeper_logs:/datalog
    networks:
      - solr-net

  # First Solr node, exposed on the host's standard Solr port.
  solr1:
    build:
      context: .
      dockerfile: solr.Dockerfile
    container_name: solr1
    ports:
      - "8983:8983"
    environment:
      - ZK_HOST=zookeeper:2181
      - SOLR_JAVA_MEM=-Xms512m -Xmx512m
    volumes:
      - solr1_data:/var/solr
    depends_on:
      - zookeeper
    networks:
      - solr-net
    command:
      - solr-foreground
      - -c  # Run in cloud mode
    # solr-init waits for this check before creating the collection.
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8983/solr/"]
      interval: 5s
      timeout: 10s
      retries: 5

  # Second Solr node; host port 8984 maps to the container's 8983.
  solr2:
    build:
      context: .
      dockerfile: solr.Dockerfile
    container_name: solr2
    ports:
      - "8984:8983"
    environment:
      - ZK_HOST=zookeeper:2181
      - SOLR_JAVA_MEM=-Xms512m -Xmx512m
    volumes:
      - solr2_data:/var/solr
    depends_on:
      - zookeeper
      - solr1
    networks:
      - solr-net
    command:
      - solr-foreground
      - -c  # Run in cloud mode

  # Initializer service to set up Solr collections (runs once and exits)
  solr-init:
    image: solr:9.5
    container_name: solr-init
    depends_on:
      solr1:
        condition: service_healthy
      solr2:
        condition: service_started
    networks:
      - solr-net
    environment:
      - ZK_HOST=zookeeper:2181
      - SOLR_HOST=solr1
      - SOLR_PORT=8983
    volumes:
      # Collection config (schema.xml etc.) mounted read-only into the container.
      - ./solr_config:/config
    # Recreates the "unified" collection from /config/unified on every run.
    command: >
      bash -c "
        # Wait for Solr to be available
        echo 'Waiting for Solr to be available...'
        until wget -q --spider http://solr1:8983/solr; do
          sleep 2
        done
        echo 'Solr is up!'
        
        # Delete existing collection if it exists
        echo 'Deleting existing unified collection if it exists...'
        curl -s 'http://solr1:8983/solr/admin/collections?action=DELETE&name=unified' || true
        
        # Upload the updated config
        echo 'Uploading updated configuration...'
        solr zk upconfig -n unified_config -d /config/unified -z zookeeper:2181
        
        # Create the unified collection with the updated config
        echo 'Creating unified collection...'
        curl -s 'http://solr1:8983/solr/admin/collections?action=CREATE&name=unified&numShards=1&replicationFactor=1&collection.configName=unified_config' || echo 'Collection creation failed - check solr logs'
        
        echo 'Initialization complete!'
      "

  # Ollama for embeddings
  ollama:
    image: ollama/ollama:latest
    container_name: ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama_data:/root/.ollama
    networks:
      - solr-net
    # Run Ollama and pull the model
    entrypoint: ["/bin/bash", "-c"]
    command: >
      "ollama serve &
       sleep 15 &&
       echo 'Pulling nomic-embed-text model...' &&
       ollama pull nomic-embed-text &&
       echo 'Model pulled successfully' &&
       tail -f /dev/null"
    # TCP probe on the API port; long start_period covers the model pull.
    healthcheck:
      test: ["CMD-SHELL", "bash -c 'cat < /dev/null > /dev/tcp/localhost/11434'"]
      interval: 15s
      timeout: 5s
      retries: 5
      start_period: 90s

  # MCP Server
  mcp-server:
    build:
      context: .
      dockerfile: Dockerfile
    container_name: mcp-server
    ports:
      - "8000:8000"
    environment:
      - SOLR_MCP_ZK_HOSTS=zookeeper:2181
      - SOLR_MCP_SOLR_URL=http://solr1:8983/solr
      - SOLR_MCP_DEFAULT_COLLECTION=unified
      - OLLAMA_BASE_URL=http://ollama:11434
    depends_on:
      # Only start once the collection exists and the embedding model is ready.
      solr-init:
        condition: service_completed_successfully
      ollama:
        condition: service_healthy
    networks:
      - solr-net
    volumes:
      # Live-mount the repo for development.
      - ./:/app

networks:
  solr-net:
    driver: bridge

volumes:
  zookeeper_data:
  zookeeper_logs:
  solr1_data:
  solr2_data:
  ollama_data:
```
--------------------------------------------------------------------------------
/scripts/process_markdown.py:
--------------------------------------------------------------------------------
```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Script to process markdown files, splitting them by section headings
  4 | and preparing them for indexing in Solr with vector embeddings.
  5 | """
  6 | 
  7 | import argparse
  8 | import json
  9 | import os
 10 | import re
 11 | import sys
 12 | from datetime import datetime
 13 | from typing import Dict, List, Tuple
 14 | 
 15 | import frontmatter
 16 | 
 17 | 
 18 | def extract_sections(markdown_content: str) -> List[Tuple[str, str]]:
 19 |     """
 20 |     Extract sections from a markdown document based on headings.
 21 |     
 22 |     Args:
 23 |         markdown_content: The content of the markdown file
 24 |         
 25 |     Returns:
 26 |         List of tuples (section_title, section_content)
 27 |     """
 28 |     # Split by headers (# Header)
 29 |     header_pattern = r'^(#{1,6})\s+(.+?)$'
 30 |     lines = markdown_content.split('\n')
 31 |     
 32 |     sections = []
 33 |     current_title = "Introduction"
 34 |     current_content = []
 35 |     
 36 |     for line in lines:
 37 |         header_match = re.match(header_pattern, line, re.MULTILINE)
 38 |         
 39 |         if header_match:
 40 |             # Save previous section
 41 |             if current_content:
 42 |                 sections.append((current_title, '\n'.join(current_content).strip()))
 43 |                 current_content = []
 44 |             
 45 |             # Start new section
 46 |             current_title = header_match.group(2).strip()
 47 |         else:
 48 |             current_content.append(line)
 49 |     
 50 |     # Add the last section
 51 |     if current_content:
 52 |         sections.append((current_title, '\n'.join(current_content).strip()))
 53 |     
 54 |     return sections
 55 | 
 56 | 
 57 | def convert_to_solr_docs(sections: List[Tuple[str, str]], filename: str, metadata: Dict) -> List[Dict]:
 58 |     """
 59 |     Convert markdown sections to Solr documents.
 60 |     
 61 |     Args:
 62 |         sections: List of (title, content) tuples
 63 |         filename: Original filename
 64 |         metadata: Metadata from frontmatter
 65 |         
 66 |     Returns:
 67 |         List of documents ready for Solr indexing
 68 |     """
 69 |     documents = []
 70 |     
 71 |     for i, (title, content) in enumerate(sections):
 72 |         # Skip empty sections
 73 |         if not content.strip():
 74 |             continue
 75 |             
 76 |         doc = {
 77 |             "id": f"{os.path.basename(filename)}_section_{i}",
 78 |             "title": title,
 79 |             "text": content,
 80 |             "source": filename,
 81 |             "section_number": i,
 82 |             "date_indexed": datetime.now().isoformat(),
 83 |             "tags": metadata.get("tags", []),
 84 |             "category": metadata.get("categories", [])
 85 |         }
 86 |         
 87 |         # Add any additional metadata
 88 |         for key, value in metadata.items():
 89 |             if key not in ["tags", "categories"] and key not in doc:
 90 |                 doc[key] = value
 91 |         
 92 |         documents.append(doc)
 93 |     
 94 |     return documents
 95 | 
 96 | 
 97 | def process_markdown_file(file_path: str, output_file: str = None):
 98 |     """
 99 |     Process a markdown file, splitting it into sections and converting to Solr documents.
100 |     
101 |     Args:
102 |         file_path: Path to the markdown file
103 |         output_file: Path to save the JSON output (if None, prints to stdout)
104 |     """
105 |     # Read and parse markdown with frontmatter
106 |     with open(file_path, 'r', encoding='utf-8') as f:
107 |         post = frontmatter.load(f)
108 |     
109 |     # Extract frontmatter metadata and content
110 |     metadata = dict(post.metadata)
111 |     content = post.content
112 |     
113 |     # Extract sections
114 |     sections = extract_sections(content)
115 |     
116 |     # Convert to Solr documents
117 |     documents = convert_to_solr_docs(sections, file_path, metadata)
118 |     
119 |     # Output
120 |     if output_file:
121 |         with open(output_file, 'w', encoding='utf-8') as f:
122 |             json.dump(documents, f, indent=2)
123 |         print(f"Processed {file_path} into {len(documents)} sections and saved to {output_file}")
124 |     else:
125 |         print(json.dumps(documents, indent=2))
126 |         print(f"Processed {file_path} into {len(documents)} sections", file=sys.stderr)
127 | 
128 | 
129 | if __name__ == "__main__":
130 |     parser = argparse.ArgumentParser(description="Process markdown files for Solr indexing")
131 |     parser.add_argument("file", help="Path to the markdown file")
132 |     parser.add_argument("--output", "-o", help="Output JSON file path")
133 |     
134 |     args = parser.parse_args()
135 |     
136 |     process_markdown_file(args.file, args.output)
```
--------------------------------------------------------------------------------
/tests/unit/test_formatting.py:
--------------------------------------------------------------------------------
```python
  1 | """Unit tests for formatting utilities."""
  2 | 
  3 | import json
  4 | from unittest.mock import Mock
  5 | 
  6 | from solr_mcp.solr.exceptions import QueryError, SolrError
  7 | from solr_mcp.solr.utils.formatting import (
  8 |     format_error_response,
  9 |     format_search_results,
 10 |     format_sql_response,
 11 | )
 12 | 
 13 | 
 14 | class TestFormatting:
 15 |     """Test cases for formatting utilities."""
 16 | 
 17 |     def test_format_search_results(self):
 18 |         """Test formatting Solr search results."""
 19 |         # Create mock pysolr Results
 20 |         mock_results = Mock()
 21 |         mock_results.docs = [
 22 |             {"id": "1", "title": "Test 1"},
 23 |             {"id": "2", "title": "Test 2"},
 24 |         ]
 25 |         mock_results.hits = 2
 26 |         mock_results.raw_response = {
 27 |             "response": {
 28 |                 "docs": mock_results.docs,
 29 |                 "numFound": mock_results.hits,
 30 |                 "start": 0,
 31 |             }
 32 |         }
 33 | 
 34 |         formatted = format_search_results(mock_results, start=0)
 35 |         result_dict = json.loads(formatted)
 36 | 
 37 |         assert "result-set" in result_dict
 38 |         assert result_dict["result-set"]["docs"] == mock_results.docs
 39 |         assert result_dict["result-set"]["numFound"] == mock_results.hits
 40 |         assert result_dict["result-set"]["start"] == 0
 41 | 
 42 |     def test_format_search_results_empty(self):
 43 |         """Test formatting empty search results."""
 44 |         mock_results = Mock()
 45 |         mock_results.docs = []
 46 |         mock_results.hits = 0
 47 |         mock_results.raw_response = {
 48 |             "response": {"docs": [], "numFound": 0, "start": 0}
 49 |         }
 50 | 
 51 |         formatted = format_search_results(mock_results, start=0)
 52 |         result_dict = json.loads(formatted)
 53 | 
 54 |         assert "result-set" in result_dict
 55 |         assert result_dict["result-set"]["docs"] == []
 56 |         assert result_dict["result-set"]["numFound"] == 0
 57 |         assert result_dict["result-set"]["start"] == 0
 58 | 
 59 |     def test_format_sql_response(self):
 60 |         """Test formatting SQL query response."""
 61 |         response = {
 62 |             "result-set": {
 63 |                 "docs": [
 64 |                     {"id": "1", "title": "Test 1"},
 65 |                     {"id": "2", "title": "Test 2"},
 66 |                 ],
 67 |                 "numFound": 2,
 68 |                 "start": 0,
 69 |             }
 70 |         }
 71 | 
 72 |         formatted = format_sql_response(response)
 73 | 
 74 |         assert formatted == response
 75 |         assert "result-set" in formatted
 76 |         assert formatted["result-set"]["numFound"] == 2
 77 |         assert len(formatted["result-set"]["docs"]) == 2
 78 | 
 79 |     def test_format_sql_response_empty(self):
 80 |         """Test formatting empty SQL query response."""
 81 |         response = {"result-set": {"docs": [], "numFound": 0, "start": 0}}
 82 | 
 83 |         formatted = format_sql_response(response)
 84 | 
 85 |         assert formatted == response
 86 |         assert "result-set" in formatted
 87 |         assert formatted["result-set"]["numFound"] == 0
 88 |         assert formatted["result-set"]["docs"] == []
 89 | 
 90 |     def test_format_error_response_query_error(self):
 91 |         """Test formatting QueryError response."""
 92 |         error = QueryError("Invalid SQL syntax")
 93 |         formatted = format_error_response(error)
 94 |         error_dict = json.loads(formatted)
 95 | 
 96 |         assert "error" in error_dict
 97 |         assert error_dict["error"]["code"] == "QUERY_ERROR"
 98 |         assert error_dict["error"]["message"] == "Invalid SQL syntax"
 99 | 
100 |     def test_format_error_response_solr_error(self):
101 |         """Test formatting SolrError response."""
102 |         error = SolrError("Connection failed")
103 |         formatted = format_error_response(error)
104 |         error_dict = json.loads(formatted)
105 | 
106 |         assert "error" in error_dict
107 |         assert error_dict["error"]["code"] == "SOLR_ERROR"
108 |         assert error_dict["error"]["message"] == "Connection failed"
109 | 
110 |     def test_format_error_response_generic_error(self):
111 |         """Test formatting generic error response."""
112 |         error = Exception("Unknown error")
113 |         formatted = format_error_response(error)
114 |         error_dict = json.loads(formatted)
115 | 
116 |         assert "error" in error_dict
117 |         assert error_dict["error"]["code"] == "INTERNAL_ERROR"
118 |         assert "Unknown error" in error_dict["error"]["message"]
119 | 
```
--------------------------------------------------------------------------------
/solr_mcp/solr/vector/results.py:
--------------------------------------------------------------------------------
```python
  1 | """Vector search results handling."""
  2 | 
  3 | from typing import Any, Dict, List, Optional
  4 | 
  5 | from pydantic import BaseModel, Field
  6 | 
  7 | 
  8 | class VectorSearchResult(BaseModel):
  9 |     """Individual vector search result."""
 10 | 
 11 |     docid: str = Field(description="Internal Solr document ID (_docid_)")
 12 |     score: float = Field(description="Search score")
 13 |     distance: Optional[float] = Field(None, description="Vector distance if available")
 14 |     metadata: Dict[str, Any] = Field(
 15 |         default_factory=dict, description="Additional metadata"
 16 |     )
 17 | 
 18 |     def __getitem__(self, key):
 19 |         """Make result subscriptable."""
 20 |         if key == "docid":
 21 |             return self.docid
 22 |         elif key == "score":
 23 |             return self.score
 24 |         elif key == "distance":
 25 |             return self.distance
 26 |         elif key == "metadata":
 27 |             return self.metadata
 28 |         raise KeyError(f"Invalid key: {key}")
 29 | 
 30 | 
 31 | class VectorSearchResults(BaseModel):
 32 |     """Container for vector search results."""
 33 | 
 34 |     results: List[VectorSearchResult] = Field(
 35 |         default_factory=list, description="List of search results"
 36 |     )
 37 |     total_found: int = Field(0, description="Total number of results found")
 38 |     top_k: int = Field(..., description="Number of results requested")
 39 |     query_time_ms: Optional[int] = Field(
 40 |         None, description="Query execution time in milliseconds"
 41 |     )
 42 | 
 43 |     @property
 44 |     def docs(self) -> List[VectorSearchResult]:
 45 |         """Get list of search results."""
 46 |         return self.results
 47 | 
 48 |     @classmethod
 49 |     def from_solr_response(
 50 |         cls, response: Dict[str, Any], top_k: int = 10
 51 |     ) -> "VectorSearchResults":
 52 |         """Create VectorSearchResults from Solr response.
 53 | 
 54 |         Args:
 55 |             response: Raw Solr response dictionary
 56 |             top_k: Number of results requested
 57 | 
 58 |         Returns:
 59 |             VectorSearchResults instance
 60 |         """
 61 |         # Extract response header
 62 |         header = response.get("responseHeader", {})
 63 |         query_time = header.get("QTime")
 64 | 
 65 |         # Extract main response section
 66 |         resp = response.get("response", {})
 67 |         docs = resp.get("docs", [])
 68 | 
 69 |         # Create results list
 70 |         results = []
 71 |         for doc in docs:
 72 |             # Handle both string and numeric _docid_
 73 |             docid = doc.get("_docid_")
 74 |             if docid is None:
 75 |                 # Try alternate field names
 76 |                 docid = doc.get("[docid]") or doc.get("docid") or "0"
 77 |             docid = str(docid)  # Ensure string type
 78 | 
 79 |             result = VectorSearchResult(
 80 |                 docid=docid,
 81 |                 score=doc.get("score", 0.0),
 82 |                 distance=doc.get("_vector_distance_"),
 83 |                 metadata={
 84 |                     k: v
 85 |                     for k, v in doc.items()
 86 |                     if k
 87 |                     not in ["_docid_", "[docid]", "docid", "score", "_vector_distance_"]
 88 |                 },
 89 |             )
 90 |             results.append(result)
 91 | 
 92 |         # Create VectorSearchResults
 93 |         return cls(
 94 |             results=results,
 95 |             total_found=resp.get("numFound", 0),
 96 |             top_k=top_k,
 97 |             query_time_ms=query_time,
 98 |         )
 99 | 
100 |     def to_dict(self) -> Dict[str, Any]:
101 |         """Convert results to dictionary format.
102 | 
103 |         Returns:
104 |             Dictionary representation of results
105 |         """
106 |         return {
107 |             "results": [result.model_dump() for result in self.results],
108 |             "metadata": {
109 |                 "total_found": self.total_found,
110 |                 "top_k": self.top_k,
111 |                 "query_time_ms": self.query_time_ms,
112 |             },
113 |         }
114 | 
115 |     def get_doc_ids(self) -> List[str]:
116 |         """Get list of document IDs from results.
117 | 
118 |         Returns:
119 |             List of document IDs
120 |         """
121 |         return [result.docid for result in self.results]
122 | 
123 |     def get_scores(self) -> List[float]:
124 |         """Get list of scores from results.
125 | 
126 |         Returns:
127 |             List of scores
128 |         """
129 |         return [result.score for result in self.results]
130 | 
131 |     def get_distances(self) -> List[Optional[float]]:
132 |         """Get list of vector distances from results.
133 | 
134 |         Returns:
135 |             List of distances (None if not available)
136 |         """
137 |         return [result.distance for result in self.results]
138 | 
```
--------------------------------------------------------------------------------
/scripts/vector_index_simple.py:
--------------------------------------------------------------------------------
```python
  1 | #!/usr/bin/env python3
  2 | """
  3 | Simple script for indexing documents with vector embeddings.
  4 | """
  5 | 
  6 | import argparse
  7 | import asyncio
  8 | import json
  9 | import os
 10 | import sys
 11 | import numpy as np
 12 | import httpx
 13 | from typing import Dict, List, Any
 14 | 
 15 | # Add the project root to the path
 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
 17 | 
 18 | from solr_mcp.embeddings.client import OllamaClient
 19 | 
 20 | 
 21 | async def generate_embeddings(texts: List[str]) -> List[List[float]]:
 22 |     """Generate embeddings for a list of texts using Ollama.
 23 |     
 24 |     Args:
 25 |         texts: List of text strings to generate embeddings for
 26 |         
 27 |     Returns:
 28 |         List of embedding vectors
 29 |     """
 30 |     client = OllamaClient()
 31 |     embeddings = []
 32 |     
 33 |     print(f"Generating embeddings for {len(texts)} documents...")
 34 |     
 35 |     # Process in smaller batches to avoid overwhelming Ollama
 36 |     batch_size = 5
 37 |     for i in range(0, len(texts), batch_size):
 38 |         batch = texts[i:i+batch_size]
 39 |         print(f"Processing batch {i//batch_size + 1}/{(len(texts) + batch_size - 1)//batch_size}...")
 40 |         batch_embeddings = await client.get_embeddings(batch)
 41 |         embeddings.extend(batch_embeddings)
 42 |     
 43 |     return embeddings
 44 | 
 45 | 
 46 | async def index_documents(json_file: str, collection: str = "testvectors", commit: bool = True):
 47 |     """Index documents with vector embeddings.
 48 |     
 49 |     Args:
 50 |         json_file: Path to the JSON file containing documents
 51 |         collection: Solr collection name
 52 |         commit: Whether to commit after indexing
 53 |     """
 54 |     # Load documents
 55 |     with open(json_file, 'r', encoding='utf-8') as f:
 56 |         documents = json.load(f)
 57 |     
 58 |     # Extract text for embedding generation
 59 |     texts = []
 60 |     for doc in documents:
 61 |         if 'text' in doc:
 62 |             texts.append(doc['text'])
 63 |         elif 'content' in doc:
 64 |             texts.append(doc['content'])
 65 |         else:
 66 |             texts.append(doc.get('title', ''))
 67 |     
 68 |     # Generate embeddings
 69 |     embeddings = await generate_embeddings(texts)
 70 |     
 71 |     # Prepare documents for indexing
 72 |     solr_docs = []
 73 |     for i, doc in enumerate(documents):
 74 |         solr_doc = {
 75 |             'id': doc['id'],
 76 |             'title': doc['title'],
 77 |             'text': doc.get('text', doc.get('content', '')),
 78 |             'source': doc.get('source', 'unknown'),
 79 |             'vector_model': 'nomic-embed-text',
 80 |             'embedding': embeddings[i]
 81 |         }
 82 |         solr_docs.append(solr_doc)
 83 |     
 84 |     # Index each document separately (a workaround for vector field issues)
 85 |     print(f"Indexing {len(solr_docs)} documents to collection '{collection}'...")
 86 |     
 87 |     async with httpx.AsyncClient() as client:
 88 |         for i, doc in enumerate(solr_docs):
 89 |             solr_url = f"http://localhost:8983/solr/{collection}/update/json/docs"
 90 |             params = {"commit": "true"} if (commit and i == len(solr_docs) - 1) else {}
 91 |             
 92 |             try:
 93 |                 response = await client.post(
 94 |                     solr_url,
 95 |                     json=doc,
 96 |                     params=params,
 97 |                     timeout=30.0
 98 |                 )
 99 |                 
100 |                 if response.status_code != 200:
101 |                     print(f"Error indexing document {doc['id']}: {response.status_code} - {response.text}")
102 |                     return False
103 |                     
104 |                 print(f"Indexed document {i+1}/{len(solr_docs)}: {doc['id']}")
105 |                 
106 |             except Exception as e:
107 |                 print(f"Error indexing document {doc['id']}: {e}")
108 |                 return False
109 |     
110 |     print(f"Successfully indexed {len(solr_docs)} documents to collection '{collection}'")
111 |     return True
112 | 
113 | 
async def main():
    """Parse command-line arguments and run the indexing job."""
    arg_parser = argparse.ArgumentParser(description="Index documents with vector embeddings")
    arg_parser.add_argument("json_file", help="Path to the JSON file containing documents")
    arg_parser.add_argument("--collection", "-c", default="testvectors", help="Solr collection name")
    arg_parser.add_argument("--no-commit", dest="commit", action="store_false", help="Don't commit after indexing")

    opts = arg_parser.parse_args()

    # Exit code mirrors the indexing outcome: 0 on success, 1 on failure.
    succeeded = await index_documents(opts.json_file, opts.collection, opts.commit)
    sys.exit(0 if succeeded else 1)
125 | 
126 | 
# Script entry point: run the async main() coroutine to completion.
if __name__ == "__main__":
    asyncio.run(main())
```