This is page 6 of 7. Use http://codebase.md/chillbruhhh/crawl4ai-mcp?lines=true&page={x} to view the full context.
# Directory Structure
```
├── .dockerignore
├── .env.example
├── .gitattributes
├── .gitignore
├── crawled_pages.sql
├── Dockerfile
├── knowledge_graphs
│ ├── ai_hallucination_detector.py
│ ├── ai_script_analyzer.py
│ ├── hallucination_reporter.py
│ ├── knowledge_graph_validator.py
│ ├── parse_repo_into_neo4j.py
│ ├── query_knowledge_graph.py
│ └── test_script.py
├── LICENSE
├── neo4j
│ └── docker-neo4j
│ ├── .github
│ │ └── ISSUE_TEMPLATE
│ │ └── bug_report.md
│ ├── .gitignore
│ ├── build-docker-image.sh
│ ├── build-utils-common-functions.sh
│ ├── COPYRIGHT
│ ├── DEVELOPMENT.md
│ ├── devenv
│ ├── devenv.local.template
│ ├── docker-image-src
│ │ ├── 2.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.0
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.1
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.2
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.3
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.4
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 3.5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.0
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.1
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ └── Dockerfile
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.2
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.3
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ └── Dockerfile
│ │ ├── 4.4
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ ├── 5
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi8
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ ├── Dockerfile-ubi8
│ │ │ └── Dockerfile-ubi9
│ │ ├── calver
│ │ │ ├── coredb
│ │ │ │ ├── docker-entrypoint.sh
│ │ │ │ ├── Dockerfile-debian
│ │ │ │ ├── Dockerfile-ubi9
│ │ │ │ ├── neo4j-admin-report.sh
│ │ │ │ └── neo4j-plugins.json
│ │ │ └── neo4j-admin
│ │ │ ├── docker-entrypoint.sh
│ │ │ ├── Dockerfile-debian
│ │ │ └── Dockerfile-ubi9
│ │ └── common
│ │ ├── semver.jq
│ │ └── utilities.sh
│ ├── generate-stub-plugin
│ │ ├── build.gradle.kts
│ │ ├── Dockerfile
│ │ ├── ExampleNeo4jPlugin.java
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── settings.gradle.kts
│ ├── LICENSE
│ ├── Makefile
│ ├── pom.xml
│ ├── publish-neo4j-admin-image.sh
│ ├── publish-neo4j-admin-images.sh
│ ├── README.md
│ └── src
│ ├── main
│ │ └── resources
│ │ └── log4j.properties
│ └── test
│ ├── java
│ │ └── com
│ │ └── neo4j
│ │ └── docker
│ │ ├── coredb
│ │ │ ├── configurations
│ │ │ │ ├── Configuration.java
│ │ │ │ ├── Setting.java
│ │ │ │ ├── TestConfSettings.java
│ │ │ │ ├── TestExtendedConf.java
│ │ │ │ └── TestJVMAdditionalConfig.java
│ │ │ ├── plugins
│ │ │ │ ├── Neo4jPluginEnv.java
│ │ │ │ ├── StubPluginHelper.java
│ │ │ │ ├── TestBundledPluginInstallation.java
│ │ │ │ ├── TestPluginInstallation.java
│ │ │ │ └── TestSemVerPluginMatching.java
│ │ │ ├── TestAdminReport.java
│ │ │ ├── TestAuthentication.java
│ │ │ ├── TestBasic.java
│ │ │ ├── TestCausalCluster.java
│ │ │ ├── TestMounting.java
│ │ │ └── TestUpgrade.java
│ │ ├── neo4jadmin
│ │ │ ├── TestAdminBasic.java
│ │ │ ├── TestBackupRestore.java
│ │ │ ├── TestBackupRestore44.java
│ │ │ ├── TestDumpLoad.java
│ │ │ ├── TestDumpLoad44.java
│ │ │ └── TestReport.java
│ │ ├── TestDeprecationWarning.java
│ │ ├── TestDockerComposeSecrets.java
│ │ └── utils
│ │ ├── DatabaseIO.java
│ │ ├── HostFileHttpHandler.java
│ │ ├── HttpServerTestExtension.java
│ │ ├── Neo4jVersion.java
│ │ ├── Neo4jVersionTest.java
│ │ ├── Network.java
│ │ ├── SetContainerUser.java
│ │ ├── TemporaryFolderManager.java
│ │ ├── TemporaryFolderManagerTest.java
│ │ ├── TestSettings.java
│ │ └── WaitStrategies.java
│ └── resources
│ ├── causal-cluster-compose.yml
│ ├── confs
│ │ ├── before50
│ │ │ ├── ConfsNotOverridden.conf
│ │ │ ├── ConfsReplaced.conf
│ │ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ │ ├── EnvVarsOverride.conf
│ │ │ ├── ExtendedConf.conf
│ │ │ ├── InvalidExtendedConf.conf
│ │ │ ├── JvmAdditionalNotOverridden.conf
│ │ │ ├── NoNewline.conf
│ │ │ └── ReadConf.conf
│ │ ├── ConfsNotOverridden.conf
│ │ ├── ConfsReplaced.conf
│ │ ├── EnterpriseOnlyNotOverwritten.conf
│ │ ├── EnvVarsOverride.conf
│ │ ├── ExtendedConf.conf
│ │ ├── InvalidExtendedConf.conf
│ │ ├── JvmAdditionalNotOverridden.conf
│ │ ├── NoNewline.conf
│ │ └── ReadConf.conf
│ ├── dockersecrets
│ │ ├── container-compose-with-incorrect-secrets.yml
│ │ ├── container-compose-with-secrets-override.yml
│ │ ├── container-compose-with-secrets.yml
│ │ ├── simple-container-compose-with-external-file-var.yml
│ │ └── simple-container-compose.yml
│ ├── ha-cluster-compose.yml
│ └── stubplugin
│ └── myPlugin.jar
├── pyproject.toml
├── README.md
├── src
│ ├── crawl4ai_mcp.py
│ └── utils.py
└── uv.lock
```
# Files
--------------------------------------------------------------------------------
/neo4j/docker-neo4j/docker-image-src/4.4/coredb/docker-entrypoint.sh:
--------------------------------------------------------------------------------
```bash
1 | #!/bin/bash -eu
2 |
3 | cmd="$1"
4 |
5 | # load useful utility functions
6 | . /startup/utilities.sh
7 |
8 | function is_readable
9 | {
10 | # this code is fairly ugly but works no matter who this script is running as.
11 | # It would be nice if the writability tests could use this logic somehow.
12 | local _file=${1}
13 | perm=$(stat -c %a "${_file}")
14 |
15 | # everyone permission
16 | if [[ ${perm:2:1} -ge 4 ]]; then
17 | return 0
18 | fi
19 | # owner permissions
20 | if [[ ${perm:0:1} -ge 4 ]]; then
21 | if [[ "$(stat -c %U ${_file})" = "${userid}" ]] || [[ "$(stat -c %u ${_file})" = "${userid}" ]]; then
22 | return 0
23 | fi
24 | fi
25 | # group permissions
26 | if [[ ${perm:1:1} -ge 4 ]]; then
27 | if containsElement "$(stat -c %g ${_file})" "${groups[@]}" || containsElement "$(stat -c %G ${_file})" "${groups[@]}" ; then
28 | return 0
29 | fi
30 | fi
31 | return 1
32 | }
33 |
34 | function is_writable
35 | {
36 | # It would be nice if this and the is_readable function could combine somehow
37 | local _file=${1}
38 | perm=$(stat -c %a "${_file}")
39 |
40 | # everyone permission
41 | if containsElement ${perm:2:1} 2 3 6 7; then
42 | return 0
43 | fi
44 | # owner permissions
45 | if containsElement ${perm:0:1} 2 3 6 7; then
46 | if [[ "$(stat -c %U ${_file})" = "${userid}" ]] || [[ "$(stat -c %u ${_file})" = "${userid}" ]]; then
47 | return 0
48 | fi
49 | fi
50 | # group permissions
51 | if containsElement ${perm:1:1} 2 3 6 7; then
52 | if containsElement "$(stat -c %g ${_file})" "${groups[@]}" || containsElement "$(stat -c %G ${_file})" "${groups[@]}" ; then
53 | return 0
54 | fi
55 | fi
56 | return 1
57 | }
58 |
59 | function check_mounted_folder_readable
60 | {
61 | local _directory=${1}
62 | debug_msg "checking ${_directory} is readable"
63 | if ! is_readable "${_directory}"; then
64 | print_permissions_advice_and_fail "${_directory}" "${userid}" "${groupid}"
65 | fi
66 | }
67 |
68 | function check_mounted_folder_writable_with_chown
69 | {
 70 | # The /data and /logs directories are a bit different because they are very likely to be mounted by the user but not
 71 | # necessarily writable.
72 | # This depends on whether a user ID is passed to the container and which folders are mounted.
73 | #
74 | # No user ID passed to container:
75 | # 1) No folders are mounted.
 76 | # The /data and /logs folders are owned by neo4j by default, so they should be writable already.
 77 | # 2) Both /logs and /data are mounted.
78 | # This means on start up, /data and /logs are owned by an unknown user and we should chown them to neo4j for
79 | # backwards compatibility.
80 | #
81 | # User ID passed to container:
82 | # 1) Both /data and /logs are mounted
83 | # The /data and /logs folders are owned by an unknown user but we *should* have rw permission to them.
84 | # That should be verified and error (helpfully) if not.
85 | # 2) User mounts /data or /logs *but not both*
86 | # The unmounted folder is still owned by neo4j, which should already be writable. The mounted folder should
87 | # have rw permissions through user id. This should be verified.
88 | # 3) No folders are mounted.
 89 | # The /data and /logs folders are owned by neo4j by default, and these are already writable by the user.
90 | # (This is a very unlikely use case).
91 |
92 | local mountFolder=${1}
93 | debug_msg "checking ${mountFolder} is writable"
94 | if running_as_root && ! secure_mode_enabled; then
95 | # check folder permissions
96 | if ! is_writable "${mountFolder}" ; then
97 | # warn that we're about to chown the folder and then chown it
98 | echo "Warning: Folder mounted to \"${mountFolder}\" is not writable from inside container. Changing folder owner to ${userid}."
99 | chown -R "${userid}":"${groupid}" "${mountFolder}"
100 | # check permissions on files in the folder
101 | elif [ $(su-exec "${userid}":"${groupid}" find "${mountFolder}" -not -writable | wc -l) -gt 0 ]; then
102 | echo "Warning: Some files inside \"${mountFolder}\" are not writable from inside container. Changing folder owner to ${userid}."
103 | chown -R "${userid}":"${groupid}" "${mountFolder}"
104 | fi
105 | else
106 | if [[ ! -w "${mountFolder}" ]] && [[ "$(stat -c %U ${mountFolder})" != "neo4j" ]]; then
107 | print_permissions_advice_and_fail "${mountFolder}" "${userid}" "${groupid}"
108 | fi
109 | fi
110 | }
111 |
112 | function load_plugin_from_location
113 | {
114 | # Install a plugin from location at runtime.
115 | local _plugin_name="${1}"
116 | local _location="${2}"
117 |
118 | local _plugins_dir="${NEO4J_HOME}/plugins"
119 | if [ -d /plugins ]; then
120 | local _plugins_dir="/plugins"
121 | fi
122 |
123 | local _destination="${_plugins_dir}/${_plugin_name}.jar"
124 |
125 | # Now we install the plugin that is shipped with Neo4j
126 | for filename in ${_location}; do
127 | echo "Installing Plugin '${_plugin_name}' from ${_location} to ${_destination}"
128 | cp --preserve "${filename}" "${_destination}"
129 | chmod +rw ${_destination}
130 | done
131 |
132 | if ! is_readable "${_destination}"; then
133 | echo >&2 "Plugin at '${_destination}' is not readable"
134 | exit 1
135 | fi
136 | }
137 |
138 | function load_plugin_from_url
139 | {
140 | # Load a plugin at runtime. The provided github repository must have a versions.json on the master branch with the
141 | # correct format.
142 | local _plugin_name="${1}" #e.g. apoc, graph-algorithms, graph-ql
143 |
144 | local _plugins_dir="${NEO4J_HOME}/plugins"
145 | if [ -d /plugins ]; then
146 | local _plugins_dir="/plugins"
147 | fi
148 | local _versions_json_url="$(jq --raw-output "with_entries( select(.key==\"${_plugin_name}\") ) | to_entries[] | .value.versions" /startup/neo4j-plugins.json )"
149 | debug_msg "Will read ${_plugin_name} versions.json from ${_versions_json_url}"
150 | # Using the same name for the plugin irrespective of version ensures we don't end up with different versions of the same plugin
151 | local _destination="${_plugins_dir}/${_plugin_name}.jar"
152 | local _neo4j_version="$(neo4j --version | cut -d' ' -f2)"
153 |
154 | # Now we call out to github to get the versions.json for this plugin and we parse that to find the url for the correct plugin jar for our neo4j version
155 | echo "Fetching versions.json for Plugin '${_plugin_name}' from ${_versions_json_url}"
156 | local _versions_json
157 | if ! _versions_json="$(wget -q --timeout 300 --tries 30 -O - "${_versions_json_url}")"; then
158 | debug_msg "ERROR: could not fetch '${_versions_json}'"
159 | echo >&2 "ERROR: could not query ${_versions_json_url} for plugin compatibility information.
160 | This could indicate a problem with your network or this container's network settings.
161 | Neo4j will continue to start, but \"${_plugin_name}\" will not be loaded."
162 | return 1
163 | fi
164 | local _plugin_jar_url="$(echo "${_versions_json}" | jq -L/startup --raw-output "import \"semver\" as lib; [ .[] | select(.neo4j|lib::semver(\"${_neo4j_version}\")) ] | min_by(.neo4j) | .jar")"
165 | if [[ -z "${_plugin_jar_url}" ]] || [[ "${_plugin_jar_url}" == "null" ]]; then
166 | debug_msg "ERROR: '${_versions_json_url}' does not contain an entry for ${_neo4j_version}"
167 | echo >&2 "ERROR: No compatible \"${_plugin_name}\" plugin found for Neo4j ${_neo4j_version} ${NEO4J_EDITION}.
168 | This can happen with the newest Neo4j versions when a compatible plugin has not yet been released.
169 | You can either use an older version of Neo4j, or continue without ${_plugin_name}.
170 | Neo4j will continue to start, but \"${_plugin_name}\" will not be loaded."
171 | else
172 | echo "Installing Plugin '${_plugin_name}' from ${_plugin_jar_url} to ${_destination} "
173 | wget -q --timeout 300 --tries 30 --output-document="${_destination}" "${_plugin_jar_url}"
174 |
175 | if ! is_readable "${_destination}"; then
176 | echo >&2 "Plugin at '${_destination}' is not readable"
177 | exit 1
178 | fi
179 | fi
180 | }
181 |
182 | function apply_plugin_default_configuration
183 | {
184 | # Apply a plugin's default configuration values (from /startup/neo4j-plugins.json) to neo4j.conf,
185 | # without overriding any properties that the user has already set in the reference config.
186 | local _plugin_name="${1}" #e.g. apoc, graph-algorithms, graphql
187 | local _reference_conf="${2}" # used to determine if we can override properties
188 | local _neo4j_conf="${NEO4J_HOME}/conf/neo4j.conf"
189 |
190 | local _property _value
191 | echo "Applying default values for plugin ${_plugin_name} to neo4j.conf"
192 | for _entry in $(jq --compact-output --raw-output "with_entries( select(.key==\"${_plugin_name}\") ) | to_entries[] | .value.properties | to_entries[]" /startup/neo4j-plugins.json); do
193 | _property="$(jq --raw-output '.key' <<< "${_entry}")"
194 | _value="$(jq --raw-output '.value' <<< "${_entry}")"
195 | debug_msg "${_plugin_name} requires setting ${_property}=${_value}"
196 |
197 | # the first grep strips out comments
198 | if grep -o "^[^#]*" "${_reference_conf}" | grep -q --fixed-strings "${_property}=" ; then
199 | # property is already set in the user provided config. In this case we don't override what has been set explicitly by the user.
200 | echo "Skipping ${_property} for plugin ${_plugin_name} because it is already set."
201 | echo "You may need to add ${_value} to the ${_property} setting in your configuration file."
202 | else
203 | if grep -o "^[^#]*" "${_neo4j_conf}" | grep -q --fixed-strings "${_property}=" ; then
204 | sed --in-place "s/${_property}=/&${_value},/" "${_neo4j_conf}"
205 | debug_msg "${_property} was already in the configuration file, so ${_value} was added to it."
206 | else
207 | echo -e "\n${_property}=${_value}" >> "${_neo4j_conf}"
208 | debug_msg "${_property}=${_value} has been added to the configuration file."
209 | fi
210 | fi
211 | done
212 | }
213 |
214 | function install_neo4j_labs_plugins
215 | {
216 | # first verify that the requested plugins are valid.
217 | debug_msg "One or more NEO4J_PLUGINS have been requested."
218 | local _known_plugins=($(jq --raw-output "keys[]" /startup/neo4j-plugins.json))
219 | debug_msg "Checking requested plugins are known and can be installed."
220 | for plugin_name in $(echo "${NEO4J_PLUGINS}" | jq --raw-output '.[]'); do
221 | if ! containsElement "${plugin_name}" "${_known_plugins[@]}"; then
222 | printf >&2 "\"%s\" is not a known Neo4j plugin. Options are:\n%s" "${plugin_name}" "$(jq --raw-output "keys[1:][]" /startup/neo4j-plugins.json)"
223 | exit 1
224 | fi
225 | done
226 |
227 | # We store a copy of the config before we modify it for the plugins to allow us to see if there are user-set values in the input config that we shouldn't override
228 | local _old_config="$(mktemp)"
229 | if [ -e "${NEO4J_HOME}"/conf/neo4j.conf ]; then
230 | cp "${NEO4J_HOME}"/conf/neo4j.conf "${_old_config}"
231 | else
232 | touch "${NEO4J_HOME}"/conf/neo4j.conf
233 | touch "${_old_config}"
234 | fi
235 | for plugin_name in $(echo "${NEO4J_PLUGINS}" | jq --raw-output '.[]'); do
236 | debug_msg "Plugin ${plugin_name} has been requested"
237 | local _location="$(jq --raw-output "with_entries( select(.key==\"${plugin_name}\") ) | to_entries[] | .value.location" /startup/neo4j-plugins.json )"
238 | if [ "${_location}" != "null" -a -n "$(shopt -s nullglob; echo ${_location})" ]; then
239 | debug_msg "$plugin_name is already in the container at ${_location}"
240 | load_plugin_from_location "${plugin_name}" "${_location}"
241 | debug_msg "Applying plugin specific configurations."
242 | apply_plugin_default_configuration "${plugin_name}" "${_old_config}"
243 | else
244 | debug_msg "$plugin_name must be downloaded."
245 | if load_plugin_from_url "${plugin_name}"; then
246 | debug_msg "Applying plugin specific configurations."
247 | apply_plugin_default_configuration "${plugin_name}" "${_old_config}"
248 | fi
249 | fi
250 | done
251 | rm "${_old_config}"
252 | }
253 |
254 | function add_docker_default_to_conf
255 | {
256 | # docker defaults should NOT overwrite values already in the conf file
257 | local _setting="${1}"
258 | local _value="${2}"
259 |
260 | if ! grep -q "^${_setting}=" "${NEO4J_HOME}"/conf/neo4j.conf
261 | then
262 | debug_msg "Appended ${_setting}=${_value} to ${NEO4J_HOME}/conf/neo4j.conf"
263 | echo -e "\n"${_setting}=${_value} >> "${NEO4J_HOME}"/conf/neo4j.conf
264 | fi
265 | }
266 |
267 | function add_env_setting_to_conf
268 | {
269 | # settings from environment variables should overwrite values already in the conf
270 | local _setting=${1}
271 | local _value=${2}
272 | local _append_not_replace_configs=("dbms.jvm.additional")
273 |
274 | if grep -q -F "${_setting}=" "${NEO4J_HOME}"/conf/neo4j.conf; then
275 | if containsElement "${_setting}" "${_append_not_replace_configs[@]}"; then
276 | debug_msg "${_setting} will be appended to neo4j.conf without replacing existing settings."
277 | else
278 | # Remove any lines containing the setting already
279 | debug_msg "Removing existing setting for ${_setting}"
280 | sed --in-place "/^${_setting}=.*/d" "${NEO4J_HOME}"/conf/neo4j.conf
281 | fi
282 | fi
283 | # Then always append setting to file
284 | debug_msg "Appended ${_setting}=${_value} to ${NEO4J_HOME}/conf/neo4j.conf"
285 | echo "${_setting}=${_value}" >> "${NEO4J_HOME}"/conf/neo4j.conf
286 | }
287 |
288 | function set_initial_password
289 | {
290 | local _neo4j_auth="${1}"
291 |
292 | # set the neo4j initial password only if you run the database server
293 | if [ "${cmd}" == "neo4j" ]; then
294 | if [ "${_neo4j_auth:-}" == "none" ]; then
295 | debug_msg "Authentication is requested to be unset"
296 | add_env_setting_to_conf "dbms.security.auth_enabled" "false"
297 | elif [[ "${_neo4j_auth:-}" =~ ^([^/]+)\/([^/]+)/?([tT][rR][uU][eE])?$ ]]; then
298 | admin_user="${BASH_REMATCH[1]}"
299 | password="${BASH_REMATCH[2]}"
300 | do_reset="${BASH_REMATCH[3]}"
301 |
302 | if [ "${password}" == "neo4j" ]; then
303 | echo >&2 "Invalid value for password. It cannot be 'neo4j', which is the default."
304 | exit 1
305 | fi
306 | if [ "${admin_user}" != "neo4j" ]; then
307 | echo >&2 "Invalid admin username, it must be neo4j."
308 | exit 1
309 | fi
310 |
311 | if running_as_root; then
312 | # running set-initial-password as root will create subfolders in /data as root, causing startup to fail when neo4j can't read or write the /data/dbms folder
313 | # creating the folder first will avoid that
314 | mkdir -p /data/dbms
315 | debug_msg "Making sure /data/dbms is owned by ${userid}:${groupid}"
316 | chown "${userid}":"${groupid}" /data/dbms
317 | fi
318 |
319 | local extra_args=()
320 | if [ "${do_reset}" == "true" ]; then
321 | extra_args+=("--require-password-change")
322 | fi
323 | if [ "${EXTENDED_CONF+"yes"}" == "yes" ]; then
324 | extra_args+=("--expand-commands")
325 | fi
326 | if debugging_enabled; then
327 | extra_args+=("--verbose")
328 | fi
329 | debug_msg "Setting initial password"
330 | debug_msg "${neo4j_admin_cmd} set-initial-password ***** ${extra_args[*]}"
331 | if debugging_enabled; then
332 | # don't suppress any output or errors in debugging mode
333 | ${neo4j_admin_cmd} set-initial-password "${password}" "${extra_args[@]}"
334 | else
335 | # Will exit with error if users already exist (and print a message explaining that)
336 | # we probably don't want the message though, since it throws an error message on restarting the container.
337 | ${neo4j_admin_cmd} set-initial-password "${password}" "${extra_args[@]}" 2>/dev/null || true
338 | fi
339 |
340 | elif [ -n "${_neo4j_auth:-}" ]; then
341 | echo "$_neo4j_auth is invalid"
342 | echo >&2 "Invalid value for NEO4J_AUTH: '${_neo4j_auth}'"
343 | exit 1
344 | fi
345 | fi
346 | }
347 |
348 | # ==== CODE STARTS ====
349 | debug_msg "DEBUGGING ENABLED"
350 |
351 | # If we're running as root, then run as the neo4j user. Otherwise
352 | # docker is running with --user and we simply use that user. Note
353 | # that su-exec, despite its name, does not replicate the functionality
354 | # of exec, so we need to use both
355 | if running_as_root; then
356 | userid="neo4j"
357 | groupid="neo4j"
358 | groups=($(id -G neo4j))
359 | exec_cmd="exec su-exec neo4j:neo4j"
360 | neo4j_admin_cmd="su-exec neo4j:neo4j neo4j-admin"
361 | debug_msg "Running as root user inside neo4j image"
362 | else
363 | userid="$(id -u)"
364 | groupid="$(id -g)"
365 | groups=($(id -G))
366 | exec_cmd="exec"
367 | neo4j_admin_cmd="neo4j-admin"
368 | debug_msg "Running as user ${userid}:${groupid} inside neo4j image"
369 | fi
370 | readonly userid
371 | readonly groupid
372 | readonly groups
373 | readonly exec_cmd
374 | readonly neo4j_admin_cmd
375 |
376 | # Need to chown the home directory
377 | if running_as_root; then
378 | debug_msg "chowning ${NEO4J_HOME} recursively to ${userid}":"${groupid}"
379 | chown -R "${userid}":"${groupid}" "${NEO4J_HOME}"
380 | chmod 700 "${NEO4J_HOME}"
381 | find "${NEO4J_HOME}" -mindepth 1 -maxdepth 1 -type d -exec chmod -R 700 {} \;
382 | debug_msg "Setting all files in ${NEO4J_HOME}/conf to permissions 600"
383 | find "${NEO4J_HOME}"/conf -type f -exec chmod -R 600 {} \;
384 | fi
385 |
386 | ## == EXTRACT SECRETS FROM FILES ===
387 | # These environment variables are set by using docker secrets and they override their equivalent env vars
388 | # They are suffixed with _FILE and prefixed by the name of the env var they should override
389 | # e.g. NEO4J_AUTH_FILE will override the value of NEO4J_AUTH
390 | # It's best to do this first so that the secrets are available for the rest of the script
391 | for variable_name in $(printenv | awk -F= '{print $1}'); do
392 | # Check if the variable ends with "_FILE" and starts with "NEO4J_"
393 | if [[ $variable_name == *"_FILE" &&
394 | $variable_name == "NEO4J_"* ]]; then
395 | # Create a new variable name by removing the "_FILE" suffix
396 | base_variable_name=${variable_name%_FILE}
397 |
398 | # Get the value of the _FILE variable
399 | secret_file_path="${!variable_name}"
400 |
401 | if is_readable "${secret_file_path}"; then
402 | # Read the secret value from the file
403 | secret_value=$(<"$secret_file_path")
404 | else
405 | # File not readable
406 | echo >&2 "The secret file '$secret_file_path' does not exist or is not readable. Make sure you have correctly configured docker secrets."
407 | exit 1
408 | fi
409 | # Assign the value to the new variable
410 | export "$base_variable_name"="$secret_value"
411 | fi
412 | done
413 |
414 | # ==== CHECK LICENSE AGREEMENT ====
415 |
416 | # Only prompt for license agreement if command contains "neo4j" in it
417 | if [[ "${cmd}" == *"neo4j"* ]]; then
418 | if [ "${NEO4J_EDITION}" == "enterprise" ]; then
419 | if [ "${NEO4J_ACCEPT_LICENSE_AGREEMENT:=no}" != "yes" ]; then
420 | echo >&2 "
421 | In order to use Neo4j Enterprise Edition you must accept the license agreement.
422 |
423 | The license agreement is available at https://neo4j.com/terms/licensing/
424 | If you have a support contract the following terms apply https://neo4j.com/terms/support-terms/
425 |
426 | (c) Neo4j Sweden AB. All Rights Reserved.
427 | Use of this Software without a proper commercial license
428 | with Neo4j, Inc. or its affiliates is prohibited.
429 | Neo4j has the right to terminate your usage if you are not compliant.
430 |
431 | More information is also available at: https://neo4j.com/licensing/
432 | If you have further inquiries about licensing, please contact us via https://neo4j.com/contact-us/
433 |
434 | To accept the commercial license agreement set the environment variable
435 | NEO4J_ACCEPT_LICENSE_AGREEMENT=yes
436 |
437 | To do this you can use the following docker argument:
438 |
439 | --env=NEO4J_ACCEPT_LICENSE_AGREEMENT=yes
440 | "
441 | exit 1
442 | fi
443 | fi
444 | fi
445 |
446 | # NEO4JLABS_PLUGINS is renamed to NEO4J_PLUGINS in 5.x, but we want the new name to work against 4.4 images too
447 | if [ -n "${NEO4JLABS_PLUGINS:-}" ];
448 | then
449 | : ${NEO4J_PLUGINS:=${NEO4JLABS_PLUGINS:-}}
450 | fi
451 |
452 | # ==== RENAME LEGACY ENVIRONMENT CONF VARIABLES ====
453 |
454 | # Env variable naming convention:
455 | # - prefix NEO4J_
456 | # - double underscore char '__' instead of single underscore '_' char in the setting name
457 | # - underscore char '_' instead of dot '.' char in the setting name
458 | # Example:
459 | # NEO4J_dbms_tx__log_rotation_retention__policy env variable to set
460 | # dbms.tx_log.rotation.retention_policy setting
461 |
462 | # Backward compatibility - map old hardcoded env variables into new naming convention (if they aren't set already)
463 | # Set some to default values if unset
464 | : ${NEO4J_dbms_tx__log_rotation_retention__policy:=${NEO4J_dbms_txLog_rotation_retentionPolicy:-}}
465 | : ${NEO4J_dbms_unmanaged__extension__classes:=${NEO4J_dbms_unmanagedExtensionClasses:-}}
466 | : ${NEO4J_dbms_allow__format__migration:=${NEO4J_dbms_allowFormatMigration:-}}
467 | : ${NEO4J_dbms_connectors_default__advertised__address:=${NEO4J_dbms_connectors_defaultAdvertisedAddress:-}}
468 |
469 | if [ "${NEO4J_EDITION}" == "enterprise" ];
470 | then
471 | : ${NEO4J_causal__clustering_expected__core__cluster__size:=${NEO4J_causalClustering_expectedCoreClusterSize:-}}
472 | : ${NEO4J_causal__clustering_initial__discovery__members:=${NEO4J_causalClustering_initialDiscoveryMembers:-}}
473 | debug_msg "Copying contents of /conf to ${NEO4J_HOME}/conf/*"
474 | : ${NEO4J_causal__clustering_discovery__advertised__address:=${NEO4J_causalClustering_discoveryAdvertisedAddress:-}}
475 | : ${NEO4J_causal__clustering_transaction__advertised__address:=${NEO4J_causalClustering_transactionAdvertisedAddress:-}}
476 | : ${NEO4J_causal__clustering_raft__advertised__address:=${NEO4J_causalClustering_raftAdvertisedAddress:-}}
477 | fi
478 |
479 | # unset old hardcoded unsupported env variables
480 | unset NEO4J_dbms_txLog_rotation_retentionPolicy NEO4J_UDC_SOURCE \
481 | NEO4J_dbms_unmanagedExtensionClasses NEO4J_dbms_allowFormatMigration \
482 | NEO4J_dbms_connectors_defaultAdvertisedAddress NEO4J_ha_serverId \
483 | NEO4J_ha_initialHosts NEO4J_causalClustering_expectedCoreClusterSize \
484 | NEO4J_causalClustering_initialDiscoveryMembers \
485 | NEO4J_causalClustering_discoveryListenAddress \
486 | NEO4J_causalClustering_discoveryAdvertisedAddress \
487 | NEO4J_causalClustering_transactionListenAddress \
488 | NEO4J_causalClustering_transactionAdvertisedAddress \
489 | NEO4J_causalClustering_raftListenAddress \
490 | NEO4J_causalClustering_raftAdvertisedAddress
491 |
492 | # ==== CHECK FILE PERMISSIONS ON MOUNTED FOLDERS ====
493 |
494 |
495 | if [ -d /conf ]; then
496 | check_mounted_folder_readable "/conf"
497 | rm -rf "${NEO4J_HOME}"/conf/*
498 | debug_msg "Copying contents of /conf to ${NEO4J_HOME}/conf/*"
499 | find /conf -type f -exec cp --preserve=ownership,mode {} "${NEO4J_HOME}"/conf \;
500 | fi
501 |
502 | if [ -d /ssl ]; then
503 | check_mounted_folder_readable "/ssl"
504 | rm -rf "${NEO4J_HOME}"/certificates
505 | ln -s /ssl "${NEO4J_HOME}"/certificates
506 | fi
507 |
508 | if [ -d /plugins ]; then
509 | if [[ -n "${NEO4J_PLUGINS:-}" ]]; then
510 | # We need write permissions to write the required plugins to /plugins
511 | debug_msg "Extra plugins were requested. Ensuring the mounted /plugins folder has the required write permissions."
512 | check_mounted_folder_writable_with_chown "/plugins"
513 | fi
514 | check_mounted_folder_readable "/plugins"
515 | : ${NEO4J_dbms_directories_plugins:="/plugins"}
516 | fi
517 |
518 | if [ -d /import ]; then
519 | check_mounted_folder_readable "/import"
520 | : ${NEO4J_dbms_directories_import:="/import"}
521 | fi
522 |
523 | if [ -d /metrics ]; then
524 | # metrics is enterprise only
525 | if [ "${NEO4J_EDITION}" == "enterprise" ];
526 | then
527 | check_mounted_folder_writable_with_chown "/metrics"
528 | : ${NEO4J_dbms_directories_metrics:="/metrics"}
529 | fi
530 | fi
531 |
532 | if [ -d /logs ]; then
533 | check_mounted_folder_writable_with_chown "/logs"
534 | : ${NEO4J_dbms_directories_logs:="/logs"}
535 | fi
536 |
537 | if [ -d /data ]; then
538 | check_mounted_folder_writable_with_chown "/data"
539 | if [ -d /data/databases ]; then
540 | check_mounted_folder_writable_with_chown "/data/databases"
541 | fi
542 | if [ -d /data/dbms ]; then
543 | check_mounted_folder_writable_with_chown "/data/dbms"
544 | fi
545 | if [ -d /data/transactions ]; then
546 | check_mounted_folder_writable_with_chown "/data/transactions"
547 | fi
548 | fi
549 |
550 | if [ -d /licenses ]; then
551 | check_mounted_folder_readable "/licenses"
552 | : ${NEO4J_dbms_directories_licenses:="/licenses"}
553 | fi
554 |
555 | # ==== SET CONFIGURATIONS ====
556 |
557 | ## == DOCKER SPECIFIC DEFAULT CONFIGURATIONS ===
558 | ## these should not override *any* configurations set by the user
559 |
560 | debug_msg "Setting docker specific configuration overrides"
561 | add_docker_default_to_conf "dbms.memory.pagecache.size" "512M"
562 | add_docker_default_to_conf "dbms.default_listen_address" "0.0.0.0"
563 |
564 | # set enterprise only docker defaults
565 | if [ "${NEO4J_EDITION}" == "enterprise" ];
566 | then
567 | debug_msg "Setting docker specific Enterprise Edition overrides"
568 | add_docker_default_to_conf "causal_clustering.discovery_advertised_address" "$(hostname):5000"
569 | add_docker_default_to_conf "causal_clustering.transaction_advertised_address" "$(hostname):6000"
570 | add_docker_default_to_conf "causal_clustering.raft_advertised_address" "$(hostname):7000"
571 | add_docker_default_to_conf "dbms.routing.advertised_address" "$(hostname):7688"
572 | fi
573 |
574 | ## == ENVIRONMENT VARIABLE CONFIGURATIONS ===
575 | ## these override BOTH defaults and any existing values in the neo4j.conf file
576 |
577 | # these are docker control envs that have the NEO4J_ prefix but we don't want to add to the config.
578 | not_configs=("NEO4J_ACCEPT_LICENSE_AGREEMENT" "NEO4J_AUTH" "NEO4J_AUTH_PATH" "NEO4J_DEBUG" "NEO4J_EDITION" \
579 | "NEO4J_HOME" "NEO4J_PLUGINS" "NEO4J_SHA256" "NEO4J_TARBALL")
580 |
581 | debug_msg "Applying configuration settings that have been set using environment variables."
582 | # list env variables with prefix NEO4J_ and create settings from them
583 | for i in $( set | grep ^NEO4J_ | awk -F'=' '{print $1}' | sort -rn ); do
584 | if containsElement "$i" "${not_configs[@]}"; then
585 | continue
586 | fi
587 |
588 | # Skip env variables with suffix _FILE, these are docker secrets
589 | if [[ "$i" == *"_FILE" ]]; then
590 | continue
591 | fi
592 |
593 | setting=$(echo "${i}" | sed 's|^NEO4J_||' | sed 's|_|.|g' | sed 's|\.\.|_|g')
594 | value=$(echo "${!i}")
595 | # Don't allow settings with no value or settings that start with a number (neo4j converts settings to env variables and you cannot have an env variable that starts with a number)
596 | if [[ -n ${value} ]]; then
597 | if [[ ! "${setting}" =~ ^[0-9]+.*$ ]]; then
598 | add_env_setting_to_conf "${setting}" "${value}"
599 | else
600 | echo >&2 "WARNING: ${setting} not written to conf file. Settings that start with a number are not permitted."
601 | fi
602 | fi
603 | done
604 |
605 | # ==== SET PASSWORD AND PLUGINS ====
606 |
607 | if [[ -n "${NEO4J_AUTH_PATH:-}" ]]; then
608 | # Validate the existence of the password file
609 | if [ ! -f "${NEO4J_AUTH_PATH}" ]; then
610 | echo >&2 "The password file '${NEO4J_AUTH_PATH}' does not exist"
611 | exit 1
612 | fi
613 | # validate the password file is readable
614 | check_mounted_folder_readable "${NEO4J_AUTH_PATH}"
615 |
616 | debug_msg "Setting initial password from file ${NEO4J_AUTH_PATH}"
617 | set_initial_password "$(cat ${NEO4J_AUTH_PATH})"
618 | else
619 | debug_msg "Setting initial password from environment"
620 | set_initial_password "${NEO4J_AUTH:-}"
621 | fi
622 |
623 |
624 | if [[ ! -z "${NEO4J_PLUGINS:-}" ]]; then
625 | # NEO4J_PLUGINS should be a json array of plugins like '["graph-algorithms", "apoc", "streams", "graphql"]'
626 | install_neo4j_labs_plugins
627 | fi
628 |
629 | # ==== CLEANUP RUN FILE ====
630 |
631 | if [ -f "${NEO4J_HOME}"/run/neo4j.pid ];
632 | then
633 | rm "${NEO4J_HOME}"/run/neo4j.pid
634 | fi
635 |
636 | # ==== INVOKE NEO4J STARTUP ====
637 |
638 | [ -f "${EXTENSION_SCRIPT:-}" ] && . ${EXTENSION_SCRIPT}
639 |
640 | if [ "${cmd}" == "dump-config" ]; then
641 | if [ ! -d "/conf" ]; then
642 | echo >&2 "You must mount a folder to /conf so that the configuration file(s) can be dumped to there."
643 | exit 1
644 | fi
645 | check_mounted_folder_writable_with_chown "/conf"
646 | cp --recursive "${NEO4J_HOME}"/conf/* /conf
647 | echo "Config Dumped"
648 | exit 0
649 | fi
650 |
651 | # this prints out a command for us to run.
652 | # the command is something like: `java ...[lots of java options]... neo4j.mainClass ...[some neo4j options]...`
653 | # putting debug messages here causes the function to break
654 | function get_neo4j_run_cmd {
655 |
656 | local extra_args=()
657 |
658 | if [ "${EXTENDED_CONF+"yes"}" == "yes" ]; then
659 | extra_args+=("--expand-commands")
660 | fi
661 | if debugging_enabled ; then
662 | extra_args+=("--verbose")
663 | fi
664 |
665 | if running_as_root; then
666 | su-exec neo4j:neo4j neo4j console --dry-run "${extra_args[@]}"
667 | else
668 | neo4j console --dry-run "${extra_args[@]}"
669 | fi
670 | }
671 |
672 | # Use su-exec to drop privileges to neo4j user
673 | # Note that su-exec, despite its name, does not replicate the
674 | # functionality of exec, so we need to use both
675 | if [ "${cmd}" == "neo4j" ]; then
676 | # separate declaration and use of get_neo4j_run_cmd so that error codes are correctly surfaced
677 | debug_msg "getting full neo4j run command"
678 | neo4j_console_cmd="$(get_neo4j_run_cmd)"
679 | debug_msg "${exec_cmd} ${neo4j_console_cmd}"
680 | eval ${exec_cmd} ${neo4j_console_cmd?:No Neo4j command was generated}
681 | else
682 | debug_msg "${exec_cmd}" "$@"
683 | ${exec_cmd} "$@"
684 | fi
685 |
```
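
The entrypoint above derives neo4j.conf settings from NEO4J_-prefixed environment variables (a single underscore maps to a dot, a double underscore to an underscore), reads *_FILE variables as docker secrets, and installs plugins named in the NEO4J_PLUGINS JSON array. As a minimal sketch of how a container might exercise those paths (the image tag, password, and setting values are illustrative placeholders, not taken from this repository):

```bash
# Hedged example only: image tag and values are placeholders.
# NEO4J_dbms_memory_pagecache_size              -> dbms.memory.pagecache.size
# NEO4J_dbms_tx__log_rotation_retention__policy -> dbms.tx_log.rotation.retention_policy
docker run --rm \
  --env NEO4J_AUTH=neo4j/change-me-please \
  --env NEO4J_dbms_memory_pagecache_size=1G \
  --env NEO4J_dbms_tx__log_rotation_retention__policy=2G \
  --env NEO4J_PLUGINS='["apoc"]' \
  neo4j:4.4

# Secret-file variant: a NEO4J_*_FILE variable overrides its NEO4J_* counterpart
# by reading the referenced file (handled by the *_FILE loop in the entrypoint).
docker run --rm \
  --env NEO4J_AUTH_FILE=/run/secrets/neo4j_auth \
  -v "$PWD/neo4j_auth.txt":/run/secrets/neo4j_auth:ro \
  neo4j:4.4
```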
--------------------------------------------------------------------------------
/neo4j/docker-neo4j/src/test/java/com/neo4j/docker/utils/TemporaryFolderManagerTest.java:
--------------------------------------------------------------------------------
```java
1 | package com.neo4j.docker.utils;
2 |
3 | import org.apache.commons.compress.archivers.ArchiveEntry;
4 | import org.apache.commons.compress.archivers.ArchiveInputStream;
5 | import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
6 | import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
7 | import org.junit.jupiter.api.AfterEach;
8 | import org.junit.jupiter.api.Assertions;
9 | import org.junit.jupiter.api.Assumptions;
10 | import org.junit.jupiter.api.Disabled;
11 | import org.junit.jupiter.api.Order;
12 | import org.junit.jupiter.api.Test;
13 | import org.junit.jupiter.api.extension.RegisterExtension;
14 | import org.junit.jupiter.api.io.TempDir;
15 | import org.junit.jupiter.params.ParameterizedTest;
16 | import org.junit.jupiter.params.provider.CsvSource;
17 | import org.junit.jupiter.params.provider.ValueSource;
18 | import org.testcontainers.containers.GenericContainer;
19 | import org.testcontainers.containers.wait.strategy.Wait;
20 | import org.testcontainers.shaded.org.apache.commons.io.IOUtils;
21 | import org.testcontainers.utility.DockerImageName;
22 |
23 | import java.io.ByteArrayOutputStream;
24 | import java.io.File;
25 | import java.io.FileInputStream;
26 | import java.io.IOException;
27 | import java.nio.file.Files;
28 | import java.nio.file.Path;
29 | import java.time.Duration;
30 | import java.util.ArrayList;
31 | import java.util.List;
32 | import java.util.regex.Pattern;
33 | import java.util.stream.Collectors;
34 |
35 | // This is a test for a test utility. It does not actually test anything to do with the docker image.
36 | // This is disabled unless we're actually trying to develop/fix the TemporaryFolderManager utility.
37 | @Disabled
38 | class TemporaryFolderManagerTest
39 | {
40 | @Order( 0 )
41 | @TempDir
42 | static Path outFolder;
43 |
44 | @Order( 1 )
45 | @RegisterExtension
46 | public TemporaryFolderManager manager = new TemporaryFolderManager(outFolder);
47 |
48 | @AfterEach
49 | void clearAnyCleanupFlags()
50 | {
51 | // some tests may leave folders remaining that are flagged for cleanup, which can affect the
52 | // tests that check that folders are flagged for cleanup. This will reset the flags after each test.
53 | manager.toCompressAfterAll.clear();
54 | }
55 |
56 | // TEST AUTOGENERATES SENSIBLE FOLDER NAME FOR EACH UNIT TEST METHOD
57 |
58 | @Test
59 | void shouldDeriveFolderNameFromTestMethodName()
60 | {
61 | String expectedMethodNameFolderPrefix = this.getClass().getName() + "_shouldDeriveFolderNameFromTestMethodName";
62 | String actualMethodFolderName = manager.methodOutputFolder.getFileName().toString();
63 | // should generate folder with similar/the same name as the method's reference and add 4 random digits to the end
64 | Assertions.assertTrue( actualMethodFolderName.startsWith( expectedMethodNameFolderPrefix ),
65 | "Did not generate correct temporary folder name from unit test method");
66 |
67 | // verify salt is added to foldername like <NAME>_1234
68 | Assertions.assertEquals( expectedMethodNameFolderPrefix.length()+5, actualMethodFolderName.length(),
69 | "Did not correctly add 4 random digits to the expected folder name");
70 | String salt = actualMethodFolderName.substring( expectedMethodNameFolderPrefix.length() + 1 );
71 | Assertions.assertDoesNotThrow( ()->Integer.parseInt( salt ),
72 | "Folder name salt was not digits. Actual: " + actualMethodFolderName );
73 |
74 | // folder should not exist until we call a createTempFolder* method
75 | Assertions.assertFalse( manager.methodOutputFolder.toFile().exists(),
76 | "Unit test method folder was created before requesting any folder creation.");
77 | }
78 |
79 | @ParameterizedTest
80 | @ValueSource(ints = {4,5,6})
81 | void parameterisedTestMethodsCreateDifferentFolders_unnamedInt(int parameter) throws IOException
82 | {
83 | String expectedMethodNameFolderRegex = this.getClass().getName() +
84 | "_parameterisedTestMethodsCreateDifferentFolders_unnamedInt" +
85 | "_\\[" + (parameter-3) + "\\]_" + parameter + "_\\d{4}";
86 | verifyParameterisedFolderNaming(expectedMethodNameFolderRegex);
87 | }
88 |
89 | @ParameterizedTest(name = "name={0}")
90 | @ValueSource(ints = {7,8,9})
91 | void parameterisedTestMethodsCreateDifferentFolders_namedInt(int parameter) throws IOException
92 | {
93 | String expectedMethodNameFolderRegex = this.getClass().getName() +
94 | "_parameterisedTestMethodsCreateDifferentFolders_namedInt_name="
95 | + parameter + "_\\d{4}";
96 | verifyParameterisedFolderNaming(expectedMethodNameFolderRegex);
97 | }
98 |
99 | @ParameterizedTest(name = "name={0}")
100 | @ValueSource(booleans = {true, false})
101 | void parameterisedTestMethodsCreateDifferentFolders_namedBoolean(boolean parameter) throws IOException
102 | {
103 | String expectedMethodNameFolderRegex = this.getClass().getName() +
104 | "_parameterisedTestMethodsCreateDifferentFolders" +
105 | "_namedBoolean_name=" + parameter + "_\\d{4}";
106 | verifyParameterisedFolderNaming(expectedMethodNameFolderRegex);
107 | }
108 |
109 | @ParameterizedTest( name = "bool1={0} bool2={1}" )
110 | @CsvSource({"true,true", "true,false", "false,true", "false,false"})
111 | void parameterisedTestMethodsCreateDifferentFolders_twoNamedBooleans(boolean parameter1, boolean parameter2) throws IOException
112 | {
113 | String expectedMethodNameFolderRegex = this.getClass().getName() +
114 | "_parameterisedTestMethodsCreateDifferentFolders" +
115 | "_twoNamedBooleans_bool1=" + parameter1 +
116 | "_bool2=" + parameter2 + "_\\d{4}";
117 | verifyParameterisedFolderNaming(expectedMethodNameFolderRegex);
118 | }
119 |
120 | private void verifyParameterisedFolderNaming(String expectedMethodNameFolderRegex) throws IOException
121 | {
122 | // get methodFolderName from TemporaryFolderManager
123 | String actualMethodFolderName = manager.methodOutputFolder.getFileName().toString();
124 | Assertions.assertTrue( Pattern.matches( expectedMethodNameFolderRegex, actualMethodFolderName ),
125 | "Folder \"" + actualMethodFolderName +
126 | "\" does not match expected regex \"" + expectedMethodNameFolderRegex + "\"");
127 | // folder should not yet exist
128 | Path expectedUnitTestMethodFolder = outFolder.resolve( manager.methodOutputFolder );
129 | Assertions.assertFalse( expectedUnitTestMethodFolder.toFile().exists(),
130 | "Folder "+expectedUnitTestMethodFolder + " should not have been created" );
131 | // now create folder
132 | manager.createFolder( "somename" );
133 | Assertions.assertTrue( expectedUnitTestMethodFolder.toFile().exists(),
134 | "Folder "+expectedUnitTestMethodFolder + " should have been created" );
135 | }
136 |
137 | @ParameterizedTest
138 | @CsvSource({"/conf,conf", "/data,data", "/import,import", "/logs,logs", "/metrics,metrics", "/plugins,plugins",
139 | "/run/something,run_something", "/place/with space,place_with_space", "/with space,with_space"})
140 | void autoGeneratesSensibleFolderNameFromMountPoint(String mountPoint, String expectedFolderName)
141 | {
142 | Assertions.assertEquals( expectedFolderName, manager.getFolderNameFromMountPoint( mountPoint),
143 | "Did not autogenerate expected name from given mount point");
144 | }
145 |
146 | // TEST ACTUAL FOLDER CREATION AND MOUNTING
147 |
148 | @Test
149 | void shouldMountAnyFolderToContainer(@TempDir Path tempFolder) throws Exception
150 | {
151 | try(GenericContainer container = makeContainer())
152 | {
153 | manager.mountHostFolderAsVolume( container, tempFolder, "/root" );
154 | container.start();
155 | container.execInContainer( "touch", "/root/testout" );
156 | String files = container.execInContainer( "ls", "/root" ).getStdout();
157 | // sanity check that /root/testout actually was created
158 | Assertions.assertTrue( files.contains( "testout" ),
159 | "did not manage to create file inside container in the mounted folder." );
160 | }
161 | Assertions.assertTrue( tempFolder.resolve( "testout" ).toFile().exists(),
162 | "Test file was created in container but not in mounted folder. " +
163 | "Probably it was unsuccessfully mounted" );
164 | }
165 |
166 | @Test
167 | void createsFolder() throws Exception
168 | {
169 | String expectedMethodNameFolderRegex = this.getClass().getName() + "_createsFolder_\\d{4}";
170 | String folderName = "somefolder";
171 | // first verify that no folder exists until we create something
172 | List<Path> allFolders = Files.list( outFolder )
173 | .filter( path -> path.getFileName()
174 | .toString()
175 | .matches( expectedMethodNameFolderRegex ))
176 | .toList();
177 | Assertions.assertEquals( 0, allFolders.size(), "A folder matching " + expectedMethodNameFolderRegex +
178 | " was created when it should not have been");
179 |
180 | // now create a folder
181 | Path p = manager.createFolder( folderName );
182 | // verify folder exists, and is located at outFolder > METHODNAME > somefolder
183 | Path methodNameFolder = verifyMethodNameFolderExistsAndIsUnique( expectedMethodNameFolderRegex );
184 | verifyTempFolder( p, folderName, methodNameFolder );
185 | }
186 |
187 | @Test
188 | void createsFolderUnderGivenParent() throws Exception
189 | {
190 | String expectedMethodNameFolderRegex = this.getClass().getName() + "_createsFolderUnderGivenParent_\\d{4}";
191 | Path unusedFolder = manager.createFolder( "somefolder1" );
192 | Path expectedParent = manager.createFolder( "somefolder2" );
193 | Path p = manager.createFolder( "somefolder3", expectedParent);
194 |
195 | Path methodNameFolder = verifyMethodNameFolderExistsAndIsUnique( expectedMethodNameFolderRegex );
196 | verifyTempFolder( unusedFolder, "somefolder1", methodNameFolder );
197 | verifyTempFolder( expectedParent, "somefolder2", methodNameFolder );
198 | verifyTempFolder( p, "somefolder3", expectedParent );
199 |
200 | // should NOT have created something under unusedFolder
201 | List<Path> f = Files.list( unusedFolder ).toList();
202 | Assertions.assertEquals( 0, f.size(),
203 | "Folder should not have been created under "+unusedFolder );
204 | }
205 |
206 | @Test
207 | void doesNotCreateFolderOutsideRoot(@TempDir Path nonRootFolder)
208 | {
209 | Assertions.assertThrows( IOException.class,
210 | () -> manager.createFolder( "somefolder", nonRootFolder),
211 | "Created a test folder outside the expected area");
212 | }
213 |
214 | @Test
215 | void createNamedFolderAndMount() throws Exception
216 | {
217 | String expectedMethodNameFolderRegex = this.getClass().getName() + "_createNamedFolderAndMount_\\d{4}";
218 | String expectedFolderName = "aFolder";
219 | Path actualTempFolder;
220 | try(GenericContainer container = makeContainer())
221 | {
222 | actualTempFolder = manager.createNamedFolderAndMountAsVolume( container, expectedFolderName, "/root" );
223 | container.start();
224 | container.execInContainer( "touch", "/root/testout" );
225 | String files = container.execInContainer( "ls", "/root" ).getStdout();
226 | // sanity check that /root/testout actually was created
227 | Assertions.assertTrue( files.contains( "testout" ),
228 | "did not manage to create file inside container in the mounted folder." );
229 | }
230 | Path methodFolder = verifyMethodNameFolderExistsAndIsUnique( expectedMethodNameFolderRegex );
231 | Path expectedTempFolder = methodFolder.resolve( expectedFolderName );
232 | verifyTempFolder( expectedTempFolder, expectedFolderName, methodFolder );
233 | Assertions.assertEquals( expectedTempFolder, actualTempFolder,
234 | "Temporary folder was not created in the expected location");
235 | Assertions.assertTrue( expectedTempFolder.resolve( "testout" ).toFile().exists(),
236 | "Test file was created in container but not in mounted folder. " +
237 | "Probably it was unsuccessfully mounted" );
238 | }
239 |
240 | @Test
241 | void createAutomaticallyNamedFolderAndMount() throws Exception
242 | {
243 | String expectedMethodNameFolderRegex = this.getClass().getName() + "_createAutomaticallyNamedFolderAndMount_\\d{4}";
244 | String expectedFolderName = "root";
245 | Path actualTempFolder;
246 | try(GenericContainer container = makeContainer())
247 | {
248 | actualTempFolder = manager.createFolderAndMountAsVolume( container, "/root" );
249 | container.start();
250 | container.execInContainer( "touch", "/root/testout" );
251 | String files = container.execInContainer( "ls", "/root" ).getStdout();
252 | // sanity check that /root/testout actually was created
253 | Assertions.assertTrue( files.contains( "testout" ),
254 | "did not manage to create file inside container in the mounted folder." );
255 | }
256 | Path methodFolder = verifyMethodNameFolderExistsAndIsUnique( expectedMethodNameFolderRegex );
257 | Path expectedTempFolder = methodFolder.resolve( expectedFolderName );
258 | verifyTempFolder( expectedTempFolder, expectedFolderName, methodFolder );
259 | Assertions.assertEquals( expectedTempFolder, actualTempFolder,
260 | "Temporary folder was not created in the expected location");
261 | Assertions.assertTrue( expectedTempFolder.resolve( "testout" ).toFile().exists(),
262 | "Test file was created in container but not in mounted folder. " +
263 | "Probably it was unsuccessfully mounted" );
264 | }
265 |
266 | private Path verifyMethodNameFolderExistsAndIsUnique(String expectedMethodNameFolderRegex) throws Exception
267 | {
268 | // get methodFolderName from TemporaryFolderManager
269 | String actualMethodFolderName = manager.methodOutputFolder.getFileName().toString();
270 | Assertions.assertTrue( Pattern.matches( expectedMethodNameFolderRegex, actualMethodFolderName ),
271 | "Folder \"" + manager.methodOutputFolder +
272 | "\" does not match expected regex \"" + expectedMethodNameFolderRegex + "\"");
273 |
274 | // verify <METHODNAME> folder was created under the root folder store.
275 | List<Path> methodNameFolders = Files.list( outFolder )
276 | .filter( path -> path.getFileName()
277 | .toString()
278 | .matches( expectedMethodNameFolderRegex ) )
279 | .toList();
280 | Assertions.assertEquals( 1, methodNameFolders.size(), "Expected only one folder called " +
281 | expectedMethodNameFolderRegex + ". Actual: " +
282 | methodNameFolders.stream()
283 | .map(Path::toString)
284 | .collect( Collectors.joining( ",")));
285 | Path methodFolder = methodNameFolders.get( 0 ); // previous assertion guarantees this to work
286 | Assertions.assertEquals( methodFolder, manager.methodOutputFolder,
287 | "Folder found in TestTemp is not the same as the one in the folder manager" );
288 | // make sure the <METHODNAME> folder is marked for cleanup
289 | Assertions.assertTrue( manager.toCompressAfterAll.contains( methodFolder ),
290 | "Did not flag " + methodFolder.getFileName() + " for cleanup. Flagged files are: " +
291 | manager.toCompressAfterAll.stream()
292 | .map(Path::toString)
293 | .collect( Collectors.joining( ",")));
294 |
295 | return methodFolder;
296 | }
297 |
298 | private void verifyTempFolder(Path tempFolder, String expectedFolderName, Path expectedParent)
299 | {
300 | Assertions.assertTrue( tempFolder.toFile().exists(), "createTempFolder did not create anything" );
301 | Assertions.assertTrue( tempFolder.toFile().isDirectory(), "Did not create a directory" );
302 | Assertions.assertEquals(expectedFolderName, tempFolder.toFile().getName(),
303 | "Did not give temp directory the expected name" );
304 | Assertions.assertTrue( tempFolder.getParent().equals( expectedParent ),
305 | "Did not create temp folder under expected parent location. Actual: "+tempFolder.getParent() );
306 | }
307 |
308 |
309 | // TEST FOLDER IS CLEANED UP
310 |
311 | private File verifyTarIsCreatedAndUnique(String expectedTarRegex) throws Exception
312 | {
313 | // verify outFolder contains ONE tar matching our regex
314 | List<Path> tarredFiles = Files.list( outFolder )
315 | .filter( path -> path.getFileName()
316 | .toString()
317 | .matches( expectedTarRegex ) )
318 | .toList();
319 | Assertions.assertEquals( 1, tarredFiles.size(), "Expected only one folder called " +
320 | expectedTarRegex + ". Actual: " +
321 | tarredFiles.stream()
322 | .map(Path::toString)
323 | .collect( Collectors.joining( ",")));
324 | return tarredFiles.get( 0 ).toFile();
325 | }
326 |
327 | @Test
328 | void createsTarOfFolder() throws Exception
329 | {
330 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
331 | String expectedTarRegex = this.getClass().getName() + "_createsTarOfFolder_\\d{4}\\.tar\\.gz";
332 | String expectedFileContents = "words words words";
333 |
334 | // create one folder with one file to be zipped.
335 | Path tempFolder = manager.createFolder( "tozip" );
336 | Files.writeString( tempFolder.resolve( "testfile" ), expectedFileContents );
337 | Assertions.assertTrue( tempFolder.resolve( "testfile" ).toFile().exists(),
338 | "Test failure. Did not successfully write to "+tempFolder);
339 |
340 | manager.triggerCleanup();
341 |
342 | File actualTar = verifyTarIsCreatedAndUnique( expectedTarRegex );
343 | List<String> files = listFilesInTar( actualTar );
344 | Assertions.assertEquals( 1, files.size(),
345 | "Tar file "+actualTar+" exists but is empty." );
346 | String writtenFile = readFileInTar( actualTar, "/tozip/testfile" );
347 | Assertions.assertEquals( expectedFileContents, writtenFile );
348 | // all temporary folders should now be deleted
349 | Assertions.assertFalse( tempFolder.toFile().exists(), "Temporary folder should have been deleted" );
350 | }
351 |
352 | @Test
353 | void createsTarOfFolder_2Files() throws Exception
354 | {
355 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
356 | String expectedTarRegex = this.getClass().getName() + "_createsTarOfFolder_2Files_\\d{4}\\.tar\\.gz";
357 | String expectedFileContents1 = "words1 words1 words1";
358 | String expectedFileContents2 = "words2 words2 words2";
359 |
360 | // create one folder with two files to be zipped.
361 | Path tempFolder = manager.createFolder( "tozip" );
362 | Files.writeString( tempFolder.resolve( "testfile1" ), expectedFileContents1 );
363 | Assertions.assertTrue( tempFolder.resolve( "testfile1" ).toFile().exists(),
364 | "Test failure. Did not successfully write to "+tempFolder);
365 | Files.writeString( tempFolder.resolve( "testfile2" ), expectedFileContents2 );
366 | Assertions.assertTrue( tempFolder.resolve( "testfile2" ).toFile().exists(),
367 | "Test failure. Did not successfully write to "+tempFolder);
368 |
369 | manager.triggerCleanup();
370 |
371 | File actualTar = verifyTarIsCreatedAndUnique( expectedTarRegex );
372 | List<String> files = listFilesInTar( actualTar );
373 | Assertions.assertEquals( 2, files.size(),
374 | "Tar file "+actualTar+" exists but is empty." );
375 | String writtenFile1 = readFileInTar( actualTar, "/tozip/testfile1" );
376 | String writtenFile2 = readFileInTar( actualTar, "/tozip/testfile2" );
377 | Assertions.assertEquals( expectedFileContents1, writtenFile1 );
378 | Assertions.assertEquals( expectedFileContents2, writtenFile2 );
379 | Assertions.assertFalse( tempFolder.toFile().exists(), "Temporary folder should have been deleted" );
380 | }
381 |
382 | @Test
383 | void createsTarOfFolder_2Folders() throws Exception
384 | {
385 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
386 | String expectedTarRegex = this.getClass().getName() + "_createsTarOfFolder_2Folders_\\d{4}\\.tar\\.gz";
387 | String expectedFileContents1 = "words1 words1 words1";
388 | String expectedFileContents2 = "words2 words2 words2";
389 |
390 | // create two folders, each with one file to be zipped.
391 | Path tempFolder1 = manager.createFolder( "tozip1" );
392 | Files.writeString( tempFolder1.resolve( "testfile" ), expectedFileContents1 );
393 | Assertions.assertTrue( tempFolder1.resolve( "testfile" ).toFile().exists(),
394 | "Test failure. Did not successfully write to "+tempFolder1);
395 | Path tempFolder2 = manager.createFolder( "tozip2" );
396 | Files.writeString( tempFolder2.resolve( "testfile" ), expectedFileContents2 );
397 | Assertions.assertTrue( tempFolder2.resolve( "testfile" ).toFile().exists(),
398 | "Test failure. Did not successfully write to "+tempFolder2);
399 |
400 | manager.triggerCleanup();
401 |
402 | File actualTar = verifyTarIsCreatedAndUnique( expectedTarRegex );
403 | List<String> files = listFilesInTar( actualTar );
404 |
405 | Assertions.assertEquals( 2, files.size(),
406 | "Tar file "+actualTar+" exists but does not contain the expected files." );
407 | String writtenFile1 = readFileInTar( actualTar, "/tozip1/testfile" );
408 | Assertions.assertEquals( expectedFileContents1, writtenFile1 );
409 | String writtenFile2 = readFileInTar( actualTar, "/tozip2/testfile" );
410 | Assertions.assertEquals( expectedFileContents2, writtenFile2 );
411 | Assertions.assertFalse( tempFolder1.toFile().exists(), "Temporary folder should have been deleted" );
412 | Assertions.assertFalse( tempFolder2.toFile().exists(), "Temporary folder should have been deleted" );
413 | }
414 |
415 | @Test
416 | void createsTarOfFolder_nestedFolders() throws Exception
417 | {
418 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
419 | String expectedTarRegex = this.getClass().getName() + "_createsTarOfFolder_nestedFolders_\\d{4}\\.tar\\.gz";
420 | // creating folders:
421 | // tempFolder1
422 | // | tempFolder2
423 | // | | testfile
424 | String expectedFileContents = "words words words";
425 |
426 |         // create a nested folder structure with one file to be zipped.
427 | Path tempFolder1 = manager.createFolder( "tozip1" );
428 | Path tempFolder2 = manager.createFolder( "tozip2", tempFolder1 );
429 | Files.writeString( tempFolder2.resolve( "testfile" ), expectedFileContents );
430 | Assertions.assertTrue( tempFolder2.resolve( "testfile" ).toFile().exists(),
431 | "Test failure. Did not successfully write to "+tempFolder2);
432 |
433 | manager.triggerCleanup();
434 | File actualTar = verifyTarIsCreatedAndUnique( expectedTarRegex );
435 |
436 | List<String> files = listFilesInTar( actualTar );
437 | Assertions.assertEquals( 1, files.size(),
438 | "Tar file "+actualTar+" exists but is empty." );
439 | String writtenFile = readFileInTar( actualTar,"/tozip1/tozip2/testfile" );
440 | Assertions.assertEquals( expectedFileContents, writtenFile );
441 | Assertions.assertFalse( tempFolder1.toFile().exists(), "Temporary folder should have been deleted" );
442 | }
443 |
444 | // TEST CODE CLEANUP WITH REOWNING
445 |
446 | @Test
447 | void canSetFolderOwnerTo7474ThenCleanup() throws Exception
448 | {
449 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
450 | String expectedTarRegex = this.getClass().getName() + "_canSetFolderOwnerTo7474ThenCleanup_\\d{4}\\.tar\\.gz";
451 | Path tempFolder = manager.createFolder( "tozip" );
452 |         Files.writeString( tempFolder.resolve( "testfile" ), "words" );
453 |
454 | manager.setFolderOwnerToNeo4j( tempFolder );
455 | // verify expected folder owner
456 | Integer fileUID = (Integer) Files.getAttribute( tempFolder, "unix:uid" );
457 | Assertions.assertEquals( 7474, fileUID.intValue(),
458 | "Did not successfully set the owner of "+tempFolder );
459 | // clean up and verify successfully cleaned up
460 | manager.triggerCleanup();
461 | verifyTarIsCreatedAndUnique( expectedTarRegex );
462 | Assertions.assertFalse( tempFolder.toFile().exists(), "Did not successfully delete "+tempFolder );
463 | }
464 |
465 | @Test
466 | void canCreateAndCleanupFoldersWithDifferentOwners() throws Exception
467 | {
468 | Assumptions.assumeFalse( TestSettings.SKIP_MOUNTED_FOLDER_TARBALLING, "Temporary folder zipping disabled" );
469 | String expectedTarRegex = this.getClass().getName() + "_canCreateAndCleanupFoldersWithDifferentOwners_\\d{4}\\.tar\\.gz";
470 | Path tempFolder7474 = manager.createFolder( "tozip7474" );
471 | Path tempFolderNormal = manager.createFolder( "tozipNormal" );
472 | Files.writeString( tempFolder7474.resolve( "testfile" ), "words" );
473 | Files.writeString( tempFolderNormal.resolve( "testfile" ), "words" );
474 |
475 | manager.setFolderOwnerToNeo4j( tempFolder7474 );
476 | Integer fileUID = (Integer) Files.getAttribute( tempFolder7474, "unix:uid" );
477 | Assertions.assertEquals( 7474, fileUID.intValue(),
478 | "Did not successfully set the owner of "+tempFolder7474 );
479 |
480 | manager.triggerCleanup();
481 | verifyTarIsCreatedAndUnique( expectedTarRegex );
482 | Assertions.assertFalse( tempFolderNormal.toFile().exists(), "Did not successfully delete "+tempFolderNormal );
483 | Assertions.assertFalse( tempFolder7474.toFile().exists(), "Did not successfully delete "+tempFolder7474 );
484 | }
485 |
486 | private GenericContainer makeContainer()
487 | {
488 |         // we don't want to test the neo4j container, so just use a generic debian-based container to check mounting.
489 | // using nginx here just because there is a straightforward way of waiting for it to be ready
490 | GenericContainer container = new GenericContainer(DockerImageName.parse("nginx:latest"))
491 | .withExposedPorts(80)
492 | .waitingFor(Wait.forHttp("/").withStartupTimeout( Duration.ofSeconds( 5 ) ));
493 | return container;
494 | }
495 |
496 | private List<String> listFilesInTar(File tar) throws IOException
497 | {
498 | List<String> files = new ArrayList<>();
499 | ArchiveInputStream in = new TarArchiveInputStream(
500 | new GzipCompressorInputStream( new FileInputStream(tar) ));
501 | ArchiveEntry entry = in.getNextEntry();
502 | while(entry != null)
503 | {
504 | files.add( entry.getName() );
505 | entry = in.getNextEntry();
506 | }
507 | in.close();
508 | return files;
509 | }
510 |
511 | private String readFileInTar(File tar, String internalFilePath) throws IOException
512 | {
513 | internalFilePath = tar.getName().split( "\\.tar\\.gz" )[0] + internalFilePath;
514 | String fileContents = null;
515 | ArchiveInputStream in = new TarArchiveInputStream(
516 | new GzipCompressorInputStream( new FileInputStream(tar) ));
517 | ArchiveEntry entry = in.getNextEntry();
518 | while(entry != null)
519 | {
520 | if(entry.getName().equals( internalFilePath ))
521 | {
522 | ByteArrayOutputStream outStream = new ByteArrayOutputStream();
523 | IOUtils.copy(in, outStream);
524 | fileContents = outStream.toString();
525 | break;
526 | }
527 | entry = in.getNextEntry();
528 | }
529 | in.close();
530 | Assertions.assertNotNull( fileContents, "Could not extract file "+internalFilePath+" from "+tar);
531 | return fileContents;
532 | }
533 | }
```
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Utility functions for the Crawl4AI MCP server.
3 | """
4 | import os
5 | import concurrent.futures
6 | from typing import List, Dict, Any, Optional, Tuple
7 | import json
8 | from supabase import create_client, Client
9 | from urllib.parse import urlparse
10 | import openai
11 | import re
12 | import time
13 |
14 | # Load OpenAI API key for embeddings
15 | openai.api_key = os.getenv("OPENAI_API_KEY")
16 |
17 | # Initialize OpenRouter client for chat completions
18 | def get_openrouter_client():
19 | """
20 | Get an OpenRouter-configured OpenAI client for chat completions.
21 |
22 | Returns:
23 | OpenAI client configured for OpenRouter
24 | """
25 | openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
26 | if not openrouter_api_key:
27 | raise ValueError("OPENROUTER_API_KEY must be set in environment variables for chat completions")
28 |
29 | return openai.OpenAI(
30 | base_url="https://openrouter.ai/api/v1",
31 | api_key=openrouter_api_key,
32 | default_headers={
33 | "HTTP-Referer": os.getenv("YOUR_SITE_URL", ""), # Optional for rankings
34 | "X-Title": os.getenv("YOUR_SITE_NAME", "Crawl4AI-MCP"), # Optional for rankings
35 | }
36 | )
37 |
38 | def get_supabase_client() -> Client:
39 | """
40 | Get a Supabase client with the URL and key from environment variables.
41 |
42 | Returns:
43 | Supabase client instance
44 | """
45 | url = os.getenv("SUPABASE_URL")
46 | key = os.getenv("SUPABASE_SERVICE_KEY")
47 |
48 | if not url or not key:
49 | raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set in environment variables")
50 |
51 | return create_client(url, key)
52 |
53 | def create_embeddings_batch(texts: List[str]) -> List[List[float]]:
54 | """
55 | Create embeddings for multiple texts in a single API call.
56 |
57 | Args:
58 | texts: List of texts to create embeddings for
59 |
60 | Returns:
61 | List of embeddings (each embedding is a list of floats)
62 | """
63 | if not texts:
64 | return []
65 |
66 | max_retries = 3
67 | retry_delay = 1.0 # Start with 1 second delay
68 |
69 | for retry in range(max_retries):
70 | try:
71 | response = openai.embeddings.create(
72 | model="text-embedding-3-small", # Hardcoding embedding model for now, will change this later to be more dynamic
73 | input=texts
74 | )
75 | return [item.embedding for item in response.data]
76 | except Exception as e:
77 | if retry < max_retries - 1:
78 | print(f"Error creating batch embeddings (attempt {retry + 1}/{max_retries}): {e}")
79 | print(f"Retrying in {retry_delay} seconds...")
80 | time.sleep(retry_delay)
81 | retry_delay *= 2 # Exponential backoff
82 | else:
83 | print(f"Failed to create batch embeddings after {max_retries} attempts: {e}")
84 | # Try creating embeddings one by one as fallback
85 | print("Attempting to create embeddings individually...")
86 | embeddings = []
87 | successful_count = 0
88 |
89 | for i, text in enumerate(texts):
90 | try:
91 | individual_response = openai.embeddings.create(
92 | model="text-embedding-3-small",
93 | input=[text]
94 | )
95 | embeddings.append(individual_response.data[0].embedding)
96 | successful_count += 1
97 | except Exception as individual_error:
98 | print(f"Failed to create embedding for text {i}: {individual_error}")
99 | # Add zero embedding as fallback
100 | embeddings.append([0.0] * 1536)
101 |
102 | print(f"Successfully created {successful_count}/{len(texts)} embeddings individually")
103 | return embeddings
104 |
105 | def create_embedding(text: str) -> List[float]:
106 | """
107 | Create an embedding for a single text using OpenAI's API.
108 |
109 | Args:
110 | text: Text to create an embedding for
111 |
112 | Returns:
113 | List of floats representing the embedding
114 | """
115 | try:
116 | embeddings = create_embeddings_batch([text])
117 | return embeddings[0] if embeddings else [0.0] * 1536
118 | except Exception as e:
119 | print(f"Error creating embedding: {e}")
120 | # Return empty embedding if there's an error
121 | return [0.0] * 1536
122 |
123 | def generate_contextual_embedding(full_document: str, chunk: str) -> Tuple[str, bool]:
124 | """
125 | Generate contextual information for a chunk within a document to improve retrieval.
126 |
127 | Args:
128 | full_document: The complete document text
129 | chunk: The specific chunk of text to generate context for
130 |
131 | Returns:
132 | Tuple containing:
133 | - The contextual text that situates the chunk within the document
134 | - Boolean indicating if contextual embedding was performed
135 | """
136 | model_choice = os.getenv("MODEL_CHOICE", "openai/gpt-4.1-nano")
137 |
138 | try:
139 | # Get OpenRouter client for chat completions
140 | openrouter_client = get_openrouter_client()
141 |
142 | # Create the prompt for generating contextual information
143 | prompt = f"""<document>
144 | {full_document[:25000]}
145 | </document>
146 | Here is the chunk we want to situate within the whole document
147 | <chunk>
148 | {chunk}
149 | </chunk>
150 | Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
151 |
152 | # Call the OpenRouter API to generate contextual information
153 | response = openrouter_client.chat.completions.create(
154 | model=model_choice,
155 | messages=[
156 | {"role": "system", "content": "You are a helpful assistant that provides concise contextual information."},
157 | {"role": "user", "content": prompt}
158 | ],
159 | temperature=0.3,
160 | max_tokens=200
161 | )
162 |
163 | # Extract the generated context
164 | context = response.choices[0].message.content.strip()
165 |
166 | # Combine the context with the original chunk
167 | contextual_text = f"{context}\n---\n{chunk}"
168 |
169 | return contextual_text, True
170 |
171 | except Exception as e:
172 | print(f"Error generating contextual embedding: {e}. Using original chunk instead.")
173 | return chunk, False
174 |
175 | def process_chunk_with_context(args):
176 | """
177 | Process a single chunk with contextual embedding.
178 | This function is designed to be used with concurrent.futures.
179 |
180 | Args:
181 | args: Tuple containing (url, content, full_document)
182 |
183 | Returns:
184 | Tuple containing:
185 | - The contextual text that situates the chunk within the document
186 | - Boolean indicating if contextual embedding was performed
187 | """
188 | url, content, full_document = args
189 | return generate_contextual_embedding(full_document, content)
190 |
191 | def add_documents_to_supabase(
192 | client: Client,
193 | urls: List[str],
194 | chunk_numbers: List[int],
195 | contents: List[str],
196 | metadatas: List[Dict[str, Any]],
197 | url_to_full_document: Dict[str, str],
198 | batch_size: int = 20
199 | ) -> None:
200 | """
201 | Add documents to the Supabase crawled_pages table in batches.
202 | Deletes existing records with the same URLs before inserting to prevent duplicates.
203 |
204 | Args:
205 | client: Supabase client
206 | urls: List of URLs
207 | chunk_numbers: List of chunk numbers
208 | contents: List of document contents
209 | metadatas: List of document metadata
210 | url_to_full_document: Dictionary mapping URLs to their full document content
211 | batch_size: Size of each batch for insertion
212 | """
213 | # Get unique URLs to delete existing records
214 | unique_urls = list(set(urls))
215 |
216 | # Delete existing records for these URLs in a single operation
217 | try:
218 | if unique_urls:
219 | # Use the .in_() filter to delete all records with matching URLs
220 | client.table("crawled_pages").delete().in_("url", unique_urls).execute()
221 | except Exception as e:
222 | print(f"Batch delete failed: {e}. Trying one-by-one deletion as fallback.")
223 | # Fallback: delete records one by one
224 | for url in unique_urls:
225 | try:
226 | client.table("crawled_pages").delete().eq("url", url).execute()
227 | except Exception as inner_e:
228 | print(f"Error deleting record for URL {url}: {inner_e}")
229 | # Continue with the next URL even if one fails
230 |
231 |     # Check whether contextual embeddings are enabled via USE_CONTEXTUAL_EMBEDDINGS
232 | use_contextual_embeddings = os.getenv("USE_CONTEXTUAL_EMBEDDINGS", "false") == "true"
233 | print(f"\n\nUse contextual embeddings: {use_contextual_embeddings}\n\n")
234 |
235 | # Process in batches to avoid memory issues
236 | for i in range(0, len(contents), batch_size):
237 | batch_end = min(i + batch_size, len(contents))
238 |
239 | # Get batch slices
240 | batch_urls = urls[i:batch_end]
241 | batch_chunk_numbers = chunk_numbers[i:batch_end]
242 | batch_contents = contents[i:batch_end]
243 | batch_metadatas = metadatas[i:batch_end]
244 |
245 |         # Apply contextual embedding to each chunk if enabled
246 | if use_contextual_embeddings:
247 | # Prepare arguments for parallel processing
248 | process_args = []
249 | for j, content in enumerate(batch_contents):
250 | url = batch_urls[j]
251 | full_document = url_to_full_document.get(url, "")
252 | process_args.append((url, content, full_document))
253 |
254 |             # Process in parallel using ThreadPoolExecutor
255 |             contextual_contents = [None] * len(batch_contents)
256 |             with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
257 |                 # Submit all tasks and collect results
258 |                 future_to_idx = {executor.submit(process_chunk_with_context, arg): idx 
259 |                                  for idx, arg in enumerate(process_args)}
260 |                 
261 |                 # Process results as they complete, storing each at its original index
262 |                 for future in concurrent.futures.as_completed(future_to_idx):
263 |                     idx = future_to_idx[future]
264 |                     try:
265 |                         result, success = future.result()
266 |                         contextual_contents[idx] = result
267 |                         if success:
268 |                             batch_metadatas[idx]["contextual_embedding"] = True
269 |                     except Exception as e:
270 |                         print(f"Error processing chunk {idx}: {e}")
271 |                         # Use original content as fallback
272 |                         contextual_contents[idx] = batch_contents[idx]
273 |             
274 |             # Results were stored by index, so they are already in the original chunk order.
275 |             if any(c is None for c in contextual_contents):
276 |                 print("Warning: some chunks produced no result; falling back to the original content for those")
277 |                 contextual_contents = [c if c is not None else batch_contents[k]
278 |                                        for k, c in enumerate(contextual_contents)]
279 | else:
280 | # If not using contextual embeddings, use original contents
281 | contextual_contents = batch_contents
282 |
283 | # Create embeddings for the entire batch at once
284 | batch_embeddings = create_embeddings_batch(contextual_contents)
285 |
286 | batch_data = []
287 | for j in range(len(contextual_contents)):
288 | # Extract metadata fields
289 | chunk_size = len(contextual_contents[j])
290 |
291 | # Extract source_id from URL
292 | parsed_url = urlparse(batch_urls[j])
293 | source_id = parsed_url.netloc or parsed_url.path
294 |
295 | # Prepare data for insertion
296 | data = {
297 | "url": batch_urls[j],
298 | "chunk_number": batch_chunk_numbers[j],
299 |                 "content": contextual_contents[j],  # Store the (possibly contextualized) chunk content used for the embedding
300 | "metadata": {
301 | "chunk_size": chunk_size,
302 | **batch_metadatas[j]
303 | },
304 | "source_id": source_id, # Add source_id field
305 | "embedding": batch_embeddings[j] # Use embedding from contextual content
306 | }
307 |
308 | batch_data.append(data)
309 |
310 | # Insert batch into Supabase with retry logic
311 | max_retries = 3
312 | retry_delay = 1.0 # Start with 1 second delay
313 |
314 | for retry in range(max_retries):
315 | try:
316 | client.table("crawled_pages").insert(batch_data).execute()
317 | # Success - break out of retry loop
318 | break
319 | except Exception as e:
320 | if retry < max_retries - 1:
321 | print(f"Error inserting batch into Supabase (attempt {retry + 1}/{max_retries}): {e}")
322 | print(f"Retrying in {retry_delay} seconds...")
323 | time.sleep(retry_delay)
324 | retry_delay *= 2 # Exponential backoff
325 | else:
326 | # Final attempt failed
327 | print(f"Failed to insert batch after {max_retries} attempts: {e}")
328 | # Optionally, try inserting records one by one as a last resort
329 | print("Attempting to insert records individually...")
330 | successful_inserts = 0
331 | for record in batch_data:
332 | try:
333 | client.table("crawled_pages").insert(record).execute()
334 | successful_inserts += 1
335 | except Exception as individual_error:
336 | print(f"Failed to insert individual record for URL {record['url']}: {individual_error}")
337 |
338 | if successful_inserts > 0:
339 | print(f"Successfully inserted {successful_inserts}/{len(batch_data)} records individually")
340 |
341 | def search_documents(
342 | client: Client,
343 | query: str,
344 | match_count: int = 10,
345 | filter_metadata: Optional[Dict[str, Any]] = None
346 | ) -> List[Dict[str, Any]]:
347 | """
348 | Search for documents in Supabase using vector similarity.
349 |
350 | Args:
351 | client: Supabase client
352 | query: Query text
353 | match_count: Maximum number of results to return
354 | filter_metadata: Optional metadata filter
355 |
356 | Returns:
357 | List of matching documents
358 | """
359 | # Create embedding for the query
360 | query_embedding = create_embedding(query)
361 |
362 | # Execute the search using the match_crawled_pages function
363 | try:
364 | # Only include filter parameter if filter_metadata is provided and not empty
365 | params = {
366 | 'query_embedding': query_embedding,
367 | 'match_count': match_count
368 | }
369 |
370 | # Only add the filter if it's actually provided and not empty
371 | if filter_metadata:
372 | params['filter'] = filter_metadata # Pass the dictionary directly, not JSON-encoded
373 |
374 | result = client.rpc('match_crawled_pages', params).execute()
375 |
376 | return result.data
377 | except Exception as e:
378 | print(f"Error searching documents: {e}")
379 | return []
380 |
381 |
382 | def extract_code_blocks(markdown_content: str, min_length: int = 1000) -> List[Dict[str, Any]]:
383 | """
384 | Extract code blocks from markdown content along with context.
385 |
386 | Args:
387 | markdown_content: The markdown content to extract code blocks from
388 | min_length: Minimum length of code blocks to extract (default: 1000 characters)
389 |
390 | Returns:
391 | List of dictionaries containing code blocks and their context
392 | """
393 | code_blocks = []
394 |
395 | # Skip if content starts with triple backticks (edge case for files wrapped in backticks)
396 | content = markdown_content.strip()
397 | start_offset = 0
398 | if content.startswith('```'):
399 | # Skip the first triple backticks
400 | start_offset = 3
401 | print("Skipping initial triple backticks")
402 |
403 | # Find all occurrences of triple backticks
404 | backtick_positions = []
405 | pos = start_offset
406 | while True:
407 | pos = markdown_content.find('```', pos)
408 | if pos == -1:
409 | break
410 | backtick_positions.append(pos)
411 | pos += 3
412 |
413 | # Process pairs of backticks
414 | i = 0
415 | while i < len(backtick_positions) - 1:
416 | start_pos = backtick_positions[i]
417 | end_pos = backtick_positions[i + 1]
418 |
419 | # Extract the content between backticks
420 | code_section = markdown_content[start_pos+3:end_pos]
421 |
422 | # Check if there's a language specifier on the first line
423 | lines = code_section.split('\n', 1)
424 | if len(lines) > 1:
425 | # Check if first line is a language specifier (no spaces, common language names)
426 | first_line = lines[0].strip()
427 | if first_line and not ' ' in first_line and len(first_line) < 20:
428 | language = first_line
429 | code_content = lines[1].strip() if len(lines) > 1 else ""
430 | else:
431 | language = ""
432 | code_content = code_section.strip()
433 | else:
434 | language = ""
435 | code_content = code_section.strip()
436 |
437 | # Skip if code block is too short
438 | if len(code_content) < min_length:
439 | i += 2 # Move to next pair
440 | continue
441 |
442 | # Extract context before (1000 chars)
443 | context_start = max(0, start_pos - 1000)
444 | context_before = markdown_content[context_start:start_pos].strip()
445 |
446 | # Extract context after (1000 chars)
447 | context_end = min(len(markdown_content), end_pos + 3 + 1000)
448 | context_after = markdown_content[end_pos + 3:context_end].strip()
449 |
450 | code_blocks.append({
451 | 'code': code_content,
452 | 'language': language,
453 | 'context_before': context_before,
454 | 'context_after': context_after,
455 | 'full_context': f"{context_before}\n\n{code_content}\n\n{context_after}"
456 | })
457 |
458 | # Move to next pair (skip the closing backtick we just processed)
459 | i += 2
460 |
461 | return code_blocks
462 |
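# --- Editor's illustrative note (not part of utils.py) -------------------------
# Given markdown such as:
#
#     Some intro text...
#     ```python
#     print("hello world")        # imagine a block longer than min_length
#     ```
#     Some closing text...
#
# a call like extract_code_blocks(markdown, min_length=10) would return one dict
# with the keys 'code', 'language' ("python" here), 'context_before',
# 'context_after' and 'full_context', where the contexts are up to 1000
# characters of surrounding text on either side of the fenced block.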
463 |
464 | def generate_code_example_summary(code: str, context_before: str, context_after: str) -> str:
465 | """
466 | Generate a summary for a code example using its surrounding context.
467 |
468 | Args:
469 | code: The code example
470 | context_before: Context before the code
471 | context_after: Context after the code
472 |
473 | Returns:
474 | A summary of what the code example demonstrates
475 | """
476 | model_choice = os.getenv("MODEL_CHOICE", "openai/gpt-4.1-nano")
477 |
478 | # Create the prompt
479 | prompt = f"""<context_before>
480 | {context_before[-500:] if len(context_before) > 500 else context_before}
481 | </context_before>
482 |
483 | <code_example>
484 | {code[:1500] if len(code) > 1500 else code}
485 | </code_example>
486 |
487 | <context_after>
488 | {context_after[:500] if len(context_after) > 500 else context_after}
489 | </context_after>
490 |
491 | Based on the code example and its surrounding context, provide a concise summary (2-3 sentences) that describes what this code example demonstrates and its purpose. Focus on the practical application and key concepts illustrated.
492 | """
493 |
494 | try:
495 | # Get OpenRouter client for chat completions
496 | openrouter_client = get_openrouter_client()
497 |
498 | response = openrouter_client.chat.completions.create(
499 | model=model_choice,
500 | messages=[
501 | {"role": "system", "content": "You are a helpful assistant that provides concise code example summaries."},
502 | {"role": "user", "content": prompt}
503 | ],
504 | temperature=0.3,
505 | max_tokens=300
506 | )
507 |
508 | return response.choices[0].message.content.strip()
509 |
510 | except Exception as e:
511 | print(f"Error generating code example summary: {e}")
512 | return "Code example for demonstration purposes."
513 |
514 |
515 | def add_code_examples_to_supabase(
516 | client: Client,
517 | urls: List[str],
518 | chunk_numbers: List[int],
519 | code_examples: List[str],
520 | summaries: List[str],
521 | metadatas: List[Dict[str, Any]],
522 | batch_size: int = 20
523 | ):
524 | """
525 | Add code examples to the Supabase code_examples table in batches.
526 |
527 | Args:
528 | client: Supabase client
529 | urls: List of URLs
530 | chunk_numbers: List of chunk numbers
531 | code_examples: List of code example contents
532 | summaries: List of code example summaries
533 | metadatas: List of metadata dictionaries
534 | batch_size: Size of each batch for insertion
535 | """
536 | if not urls:
537 | return
538 |
539 | # Delete existing records for these URLs
540 | unique_urls = list(set(urls))
541 | for url in unique_urls:
542 | try:
543 | client.table('code_examples').delete().eq('url', url).execute()
544 | except Exception as e:
545 | print(f"Error deleting existing code examples for {url}: {e}")
546 |
547 | # Process in batches
548 | total_items = len(urls)
549 | for i in range(0, total_items, batch_size):
550 | batch_end = min(i + batch_size, total_items)
551 | batch_texts = []
552 |
553 | # Create combined texts for embedding (code + summary)
554 | for j in range(i, batch_end):
555 | combined_text = f"{code_examples[j]}\n\nSummary: {summaries[j]}"
556 | batch_texts.append(combined_text)
557 |
558 | # Create embeddings for the batch
559 | embeddings = create_embeddings_batch(batch_texts)
560 |
561 | # Check if embeddings are valid (not all zeros)
562 | valid_embeddings = []
563 | for embedding in embeddings:
564 | if embedding and not all(v == 0.0 for v in embedding):
565 | valid_embeddings.append(embedding)
566 | else:
567 | print(f"Warning: Zero or invalid embedding detected, creating new one...")
568 | # Try to create a single embedding as fallback
569 | single_embedding = create_embedding(batch_texts[len(valid_embeddings)])
570 | valid_embeddings.append(single_embedding)
571 |
572 | # Prepare batch data
573 | batch_data = []
574 | for j, embedding in enumerate(valid_embeddings):
575 | idx = i + j
576 |
577 | # Extract source_id from URL
578 | parsed_url = urlparse(urls[idx])
579 | source_id = parsed_url.netloc or parsed_url.path
580 |
581 | batch_data.append({
582 | 'url': urls[idx],
583 | 'chunk_number': chunk_numbers[idx],
584 | 'content': code_examples[idx],
585 | 'summary': summaries[idx],
586 | 'metadata': metadatas[idx], # Store as JSON object, not string
587 | 'source_id': source_id,
588 | 'embedding': embedding
589 | })
590 |
591 | # Insert batch into Supabase with retry logic
592 | max_retries = 3
593 | retry_delay = 1.0 # Start with 1 second delay
594 |
595 | for retry in range(max_retries):
596 | try:
597 | client.table('code_examples').insert(batch_data).execute()
598 | # Success - break out of retry loop
599 | break
600 | except Exception as e:
601 | if retry < max_retries - 1:
602 | print(f"Error inserting batch into Supabase (attempt {retry + 1}/{max_retries}): {e}")
603 | print(f"Retrying in {retry_delay} seconds...")
604 | time.sleep(retry_delay)
605 | retry_delay *= 2 # Exponential backoff
606 | else:
607 | # Final attempt failed
608 | print(f"Failed to insert batch after {max_retries} attempts: {e}")
609 | # Optionally, try inserting records one by one as a last resort
610 | print("Attempting to insert records individually...")
611 | successful_inserts = 0
612 | for record in batch_data:
613 | try:
614 | client.table('code_examples').insert(record).execute()
615 | successful_inserts += 1
616 | except Exception as individual_error:
617 | print(f"Failed to insert individual record for URL {record['url']}: {individual_error}")
618 |
619 | if successful_inserts > 0:
620 | print(f"Successfully inserted {successful_inserts}/{len(batch_data)} records individually")
621 | print(f"Inserted batch {i//batch_size + 1} of {(total_items + batch_size - 1)//batch_size} code examples")
622 |
623 |
624 | def update_source_info(client: Client, source_id: str, summary: str, word_count: int):
625 | """
626 | Update or insert source information in the sources table.
627 |
628 | Args:
629 | client: Supabase client
630 | source_id: The source ID (domain)
631 | summary: Summary of the source
632 | word_count: Total word count for the source
633 | """
634 | try:
635 | # Try to update existing source
636 | result = client.table('sources').update({
637 | 'summary': summary,
638 | 'total_word_count': word_count,
639 | 'updated_at': 'now()'
640 | }).eq('source_id', source_id).execute()
641 |
642 | # If no rows were updated, insert new source
643 | if not result.data:
644 | client.table('sources').insert({
645 | 'source_id': source_id,
646 | 'summary': summary,
647 | 'total_word_count': word_count
648 | }).execute()
649 | print(f"Created new source: {source_id}")
650 | else:
651 | print(f"Updated source: {source_id}")
652 |
653 | except Exception as e:
654 | print(f"Error updating source {source_id}: {e}")
655 |
656 |
657 | def extract_source_summary(source_id: str, content: str, max_length: int = 500) -> str:
658 | """
659 | Extract a summary for a source from its content using an LLM.
660 |
661 | This function uses the OpenRouter API to generate a concise summary of the source content.
662 |
663 | Args:
664 | source_id: The source ID (domain)
665 | content: The content to extract a summary from
666 | max_length: Maximum length of the summary
667 |
668 | Returns:
669 | A summary string
670 | """
671 | # Default summary if we can't extract anything meaningful
672 | default_summary = f"Content from {source_id}"
673 |
674 | if not content or len(content.strip()) == 0:
675 | return default_summary
676 |
677 | # Get the model choice from environment variables
678 | model_choice = os.getenv("MODEL_CHOICE", "openai/gpt-4.1-nano")
679 |
680 | # Limit content length to avoid token limits
681 | truncated_content = content[:25000] if len(content) > 25000 else content
682 |
683 | # Create the prompt for generating the summary
684 | prompt = f"""<source_content>
685 | {truncated_content}
686 | </source_content>
687 |
688 | The above content is from the documentation for '{source_id}'. Please provide a concise summary (3-5 sentences) that describes what this library/tool/framework is about. The summary should help understand what the library/tool/framework accomplishes and its purpose.
689 | """
690 |
691 | try:
692 | # Get OpenRouter client for chat completions
693 | openrouter_client = get_openrouter_client()
694 |
695 | # Call the OpenRouter API to generate the summary
696 | response = openrouter_client.chat.completions.create(
697 | model=model_choice,
698 | messages=[
699 | {"role": "system", "content": "You are a helpful assistant that provides concise library/tool/framework summaries."},
700 | {"role": "user", "content": prompt}
701 | ],
702 | temperature=0.3,
703 | max_tokens=300
704 | )
705 |
706 | # Extract the generated summary
707 | summary = response.choices[0].message.content.strip()
708 |
709 | # Ensure the summary is not too long
710 | if len(summary) > max_length:
711 | summary = summary[:max_length] + "..."
712 |
713 | return summary
714 |
715 | except Exception as e:
716 | print(f"Error generating summary with LLM for {source_id}: {e}. Using default summary.")
717 | return default_summary
718 |
719 |
720 | def search_code_examples(
721 | client: Client,
722 | query: str,
723 | match_count: int = 10,
724 | filter_metadata: Optional[Dict[str, Any]] = None,
725 | source_id: Optional[str] = None
726 | ) -> List[Dict[str, Any]]:
727 | """
728 | Search for code examples in Supabase using vector similarity.
729 |
730 | Args:
731 | client: Supabase client
732 | query: Query text
733 | match_count: Maximum number of results to return
734 | filter_metadata: Optional metadata filter
735 | source_id: Optional source ID to filter results
736 |
737 | Returns:
738 | List of matching code examples
739 | """
740 | # Create a more descriptive query for better embedding match
741 | # Since code examples are embedded with their summaries, we should make the query more descriptive
742 | enhanced_query = f"Code example for {query}\n\nSummary: Example code showing {query}"
743 |
744 | # Create embedding for the enhanced query
745 | query_embedding = create_embedding(enhanced_query)
746 |
747 | # Execute the search using the match_code_examples function
748 | try:
749 | # Only include filter parameter if filter_metadata is provided and not empty
750 | params = {
751 | 'query_embedding': query_embedding,
752 | 'match_count': match_count
753 | }
754 |
755 | # Only add the filter if it's actually provided and not empty
756 | if filter_metadata:
757 | params['filter'] = filter_metadata
758 |
759 | # Add source filter if provided
760 | if source_id:
761 | params['source_filter'] = source_id
762 |
763 | result = client.rpc('match_code_examples', params).execute()
764 |
765 | return result.data
766 | except Exception as e:
767 | print(f"Error searching code examples: {e}")
768 | return []
```
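
A minimal usage sketch of the helpers above, added for illustration; it is not a file from the repository. It assumes `src/` is on the Python path and that `SUPABASE_URL`, `SUPABASE_SERVICE_KEY`, and `OPENAI_API_KEY` are set in the environment; the URL, chunk texts, and query string are made up.

```python
# Illustrative only: exercises get_supabase_client, add_documents_to_supabase and
# search_documents from src/utils.py with made-up data.
from utils import (
    get_supabase_client,
    add_documents_to_supabase,
    search_documents,
)

client = get_supabase_client()  # reads SUPABASE_URL / SUPABASE_SERVICE_KEY from the environment

# Hypothetical crawl output: one URL split into two chunks.
url = "https://example.com/docs/intro"
chunks = ["First chunk of the page...", "Second chunk of the page..."]

add_documents_to_supabase(
    client=client,
    urls=[url] * len(chunks),
    chunk_numbers=list(range(len(chunks))),
    contents=chunks,
    metadatas=[{"title": "Intro"} for _ in chunks],
    url_to_full_document={url: "\n".join(chunks)},
)

# Vector search over what was just stored; filter_metadata and match_count are optional.
results = search_documents(client, query="how do I get started?", match_count=5)
for row in results:
    print(row)  # returned fields depend on the match_crawled_pages function in crawled_pages.sql
```
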
--------------------------------------------------------------------------------
/knowledge_graphs/parse_repo_into_neo4j.py:
--------------------------------------------------------------------------------
```python
1 | """
2 | Direct Neo4j GitHub Code Repository Extractor
3 |
4 | Creates nodes and relationships directly in Neo4j without Graphiti:
5 | - File nodes
6 | - Class nodes
7 | - Method nodes
8 | - Function nodes
9 | - Import relationships
10 |
11 | Bypasses all LLM processing for maximum speed.
12 | """
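# --- Editor's illustrative note (not part of this file) ------------------------
# The graph produced below can be explored with Cypher along these lines; the
# labels and relationship types match those used in _create_graph further down:
#
#   MATCH (r:Repository {name: "my-repo"})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
#   OPTIONAL MATCH (c)-[:HAS_METHOD]->(m:Method)
#   RETURN f.path, c.name, collect(m.name) AS methods
#
# "my-repo" is a placeholder for the repository name passed to analyze_repository.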
13 |
14 | import asyncio
15 | import logging
16 | import os
17 | import subprocess
18 | import shutil
19 | from datetime import datetime, timezone
20 | from pathlib import Path
21 | from typing import List, Optional, Dict, Any, Set
22 | import ast
23 |
24 | from dotenv import load_dotenv
25 | from neo4j import AsyncGraphDatabase
26 |
27 | # Configure logging
28 | logging.basicConfig(
29 | level=logging.INFO,
30 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
31 | datefmt='%Y-%m-%d %H:%M:%S',
32 | )
33 | logger = logging.getLogger(__name__)
34 |
35 |
36 | class Neo4jCodeAnalyzer:
37 | """Analyzes code for direct Neo4j insertion"""
38 |
39 | def __init__(self):
40 | # External modules to ignore
41 | self.external_modules = {
42 | # Python standard library
43 | 'os', 'sys', 'json', 'logging', 'datetime', 'pathlib', 'typing', 'collections',
44 | 'asyncio', 'subprocess', 'ast', 're', 'string', 'urllib', 'http', 'email',
45 | 'time', 'uuid', 'hashlib', 'base64', 'itertools', 'functools', 'operator',
46 | 'contextlib', 'copy', 'pickle', 'tempfile', 'shutil', 'glob', 'fnmatch',
47 | 'io', 'codecs', 'locale', 'platform', 'socket', 'ssl', 'threading', 'queue',
48 | 'multiprocessing', 'concurrent', 'warnings', 'traceback', 'inspect',
49 | 'importlib', 'pkgutil', 'types', 'weakref', 'gc', 'dataclasses', 'enum',
50 | 'abc', 'numbers', 'decimal', 'fractions', 'math', 'cmath', 'random', 'statistics',
51 |
52 | # Common third-party libraries
53 | 'requests', 'urllib3', 'httpx', 'aiohttp', 'flask', 'django', 'fastapi',
54 | 'pydantic', 'sqlalchemy', 'alembic', 'psycopg2', 'pymongo', 'redis',
55 | 'celery', 'pytest', 'unittest', 'mock', 'faker', 'factory', 'hypothesis',
56 | 'numpy', 'pandas', 'matplotlib', 'seaborn', 'scipy', 'sklearn', 'torch',
57 | 'tensorflow', 'keras', 'opencv', 'pillow', 'boto3', 'botocore', 'azure',
58 | 'google', 'openai', 'anthropic', 'langchain', 'transformers', 'huggingface_hub',
59 | 'click', 'typer', 'rich', 'colorama', 'tqdm', 'python-dotenv', 'pyyaml',
60 | 'toml', 'configargparse', 'marshmallow', 'attrs', 'dataclasses-json',
61 | 'jsonschema', 'cerberus', 'voluptuous', 'schema', 'jinja2', 'mako',
62 | 'cryptography', 'bcrypt', 'passlib', 'jwt', 'authlib', 'oauthlib'
63 | }
64 |
65 | def analyze_python_file(self, file_path: Path, repo_root: Path, project_modules: Set[str]) -> Dict[str, Any]:
66 | """Extract structure for direct Neo4j insertion"""
67 | try:
68 | with open(file_path, 'r', encoding='utf-8') as f:
69 | content = f.read()
70 |
71 | tree = ast.parse(content)
72 | relative_path = str(file_path.relative_to(repo_root))
73 | module_name = self._get_importable_module_name(file_path, repo_root, relative_path)
74 |
75 | # Extract structure
76 | classes = []
77 | functions = []
78 | imports = []
79 |
80 | for node in ast.walk(tree):
81 | if isinstance(node, ast.ClassDef):
82 | # Extract class with its methods and attributes
83 | methods = []
84 | attributes = []
85 |
86 | for item in node.body:
87 | if isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
88 | if not item.name.startswith('_'): # Public methods only
89 | # Extract comprehensive parameter info
90 | params = self._extract_function_parameters(item)
91 |
92 | # Get return type annotation
93 | return_type = self._get_name(item.returns) if item.returns else 'Any'
94 |
95 | # Create detailed parameter list for Neo4j storage
96 | params_detailed = []
97 | for p in params:
98 | param_str = f"{p['name']}:{p['type']}"
99 | if p['optional'] and p['default'] is not None:
100 | param_str += f"={p['default']}"
101 | elif p['optional']:
102 | param_str += "=None"
103 | if p['kind'] != 'positional':
104 | param_str = f"[{p['kind']}] {param_str}"
105 | params_detailed.append(param_str)
106 |
107 | methods.append({
108 | 'name': item.name,
109 | 'params': params, # Full parameter objects
110 | 'params_detailed': params_detailed, # Detailed string format
111 | 'return_type': return_type,
112 | 'args': [arg.arg for arg in item.args.args if arg.arg != 'self'] # Keep for backwards compatibility
113 | })
114 | elif isinstance(item, ast.AnnAssign) and isinstance(item.target, ast.Name):
115 | # Type annotated attributes
116 | if not item.target.id.startswith('_'):
117 | attributes.append({
118 | 'name': item.target.id,
119 | 'type': self._get_name(item.annotation) if item.annotation else 'Any'
120 | })
121 |
122 | classes.append({
123 | 'name': node.name,
124 | 'full_name': f"{module_name}.{node.name}",
125 | 'methods': methods,
126 | 'attributes': attributes
127 | })
128 |
129 | elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
130 | # Only top-level functions
131 | if not any(node in cls_node.body for cls_node in ast.walk(tree) if isinstance(cls_node, ast.ClassDef)):
132 | if not node.name.startswith('_'):
133 | # Extract comprehensive parameter info
134 | params = self._extract_function_parameters(node)
135 |
136 | # Get return type annotation
137 | return_type = self._get_name(node.returns) if node.returns else 'Any'
138 |
139 | # Create detailed parameter list for Neo4j storage
140 | params_detailed = []
141 | for p in params:
142 | param_str = f"{p['name']}:{p['type']}"
143 | if p['optional'] and p['default'] is not None:
144 | param_str += f"={p['default']}"
145 | elif p['optional']:
146 | param_str += "=None"
147 | if p['kind'] != 'positional':
148 | param_str = f"[{p['kind']}] {param_str}"
149 | params_detailed.append(param_str)
150 |
151 | # Simple format for backwards compatibility
152 | params_list = [f"{p['name']}:{p['type']}" for p in params]
153 |
154 | functions.append({
155 | 'name': node.name,
156 | 'full_name': f"{module_name}.{node.name}",
157 | 'params': params, # Full parameter objects
158 | 'params_detailed': params_detailed, # Detailed string format
159 | 'params_list': params_list, # Simple string format for backwards compatibility
160 | 'return_type': return_type,
161 | 'args': [arg.arg for arg in node.args.args] # Keep for backwards compatibility
162 | })
163 |
164 | elif isinstance(node, (ast.Import, ast.ImportFrom)):
165 | # Track internal imports only
166 | if isinstance(node, ast.Import):
167 | for alias in node.names:
168 | if self._is_likely_internal(alias.name, project_modules):
169 | imports.append(alias.name)
170 | elif isinstance(node, ast.ImportFrom) and node.module:
171 | if (node.module.startswith('.') or self._is_likely_internal(node.module, project_modules)):
172 | imports.append(node.module)
173 |
174 | return {
175 | 'module_name': module_name,
176 | 'file_path': relative_path,
177 | 'classes': classes,
178 | 'functions': functions,
179 | 'imports': list(set(imports)), # Remove duplicates
180 | 'line_count': len(content.splitlines())
181 | }
182 |
183 | except Exception as e:
184 | logger.warning(f"Could not analyze {file_path}: {e}")
185 | return None
186 |
187 | def _is_likely_internal(self, import_name: str, project_modules: Set[str]) -> bool:
188 | """Check if an import is likely internal to the project"""
189 | if not import_name:
190 | return False
191 |
192 | # Relative imports are definitely internal
193 | if import_name.startswith('.'):
194 | return True
195 |
196 | # Check if it's a known external module
197 | base_module = import_name.split('.')[0]
198 | if base_module in self.external_modules:
199 | return False
200 |
201 | # Check if it matches any project module
202 | for project_module in project_modules:
203 | if import_name.startswith(project_module):
204 | return True
205 |
206 | # If it's not obviously external, consider it internal
207 | if (not any(ext in base_module.lower() for ext in ['test', 'mock', 'fake']) and
208 | not base_module.startswith('_') and
209 | len(base_module) > 2):
210 | return True
211 |
212 | return False
213 |
214 | def _get_importable_module_name(self, file_path: Path, repo_root: Path, relative_path: str) -> str:
215 | """Determine the actual importable module name for a Python file"""
216 | # Start with the default: convert file path to module path
217 | default_module = relative_path.replace('/', '.').replace('\\', '.').replace('.py', '')
218 |
219 | # Common patterns to detect the actual package root
220 | path_parts = Path(relative_path).parts
221 |
222 | # Look for common package indicators
223 | package_roots = []
224 |
225 | # Check each directory level for __init__.py to find package boundaries
226 | current_path = repo_root
227 | for i, part in enumerate(path_parts[:-1]): # Exclude the .py file itself
228 | current_path = current_path / part
229 | if (current_path / '__init__.py').exists():
230 | # This is a package directory, mark it as a potential root
231 | package_roots.append(i)
232 |
233 | if package_roots:
234 | # Use the first (outermost) package as the root
235 | package_start = package_roots[0]
236 | module_parts = path_parts[package_start:]
237 | module_name = '.'.join(module_parts).replace('.py', '')
238 | return module_name
239 |
240 | # Fallback: look for common Python project structures
241 | # Skip common non-package directories
242 | skip_dirs = {'src', 'lib', 'source', 'python', 'pkg', 'packages'}
243 |
244 | # Find the first directory that's not in skip_dirs
245 | filtered_parts = []
246 | for part in path_parts:
247 | if part.lower() not in skip_dirs or filtered_parts: # Once we start including, include everything
248 | filtered_parts.append(part)
249 |
250 | if filtered_parts:
251 | module_name = '.'.join(filtered_parts).replace('.py', '')
252 | return module_name
253 |
254 | # Final fallback: use the default
255 | return default_module
256 |
257 | def _extract_function_parameters(self, func_node):
258 | """Comprehensive parameter extraction from function definition"""
259 | params = []
260 |
261 | # Regular positional arguments
262 | for i, arg in enumerate(func_node.args.args):
263 | if arg.arg == 'self':
264 | continue
265 |
266 | param_info = {
267 | 'name': arg.arg,
268 | 'type': self._get_name(arg.annotation) if arg.annotation else 'Any',
269 | 'kind': 'positional',
270 | 'optional': False,
271 | 'default': None
272 | }
273 |
274 | # Check if this argument has a default value
275 | defaults_start = len(func_node.args.args) - len(func_node.args.defaults)
276 | if i >= defaults_start:
277 | default_idx = i - defaults_start
278 | if default_idx < len(func_node.args.defaults):
279 | param_info['optional'] = True
280 | param_info['default'] = self._get_default_value(func_node.args.defaults[default_idx])
281 |
282 | params.append(param_info)
283 |
284 | # *args parameter
285 | if func_node.args.vararg:
286 | params.append({
287 | 'name': f"*{func_node.args.vararg.arg}",
288 | 'type': self._get_name(func_node.args.vararg.annotation) if func_node.args.vararg.annotation else 'Any',
289 | 'kind': 'var_positional',
290 | 'optional': True,
291 | 'default': None
292 | })
293 |
294 | # Keyword-only arguments (after *)
295 | for i, arg in enumerate(func_node.args.kwonlyargs):
296 | param_info = {
297 | 'name': arg.arg,
298 | 'type': self._get_name(arg.annotation) if arg.annotation else 'Any',
299 | 'kind': 'keyword_only',
300 |                 'optional': True,  # Assume optional; set to False below if there is no default
301 | 'default': None
302 | }
303 |
304 | # Check for default value
305 | if i < len(func_node.args.kw_defaults) and func_node.args.kw_defaults[i] is not None:
306 | param_info['default'] = self._get_default_value(func_node.args.kw_defaults[i])
307 | else:
308 | param_info['optional'] = False # No default = required kwonly arg
309 |
310 | params.append(param_info)
311 |
312 | # **kwargs parameter
313 | if func_node.args.kwarg:
314 | params.append({
315 | 'name': f"**{func_node.args.kwarg.arg}",
316 | 'type': self._get_name(func_node.args.kwarg.annotation) if func_node.args.kwarg.annotation else 'Dict[str, Any]',
317 | 'kind': 'var_keyword',
318 | 'optional': True,
319 | 'default': None
320 | })
321 |
322 | return params
323 |
324 | def _get_default_value(self, default_node):
325 | """Extract default value from AST node"""
326 | try:
327 | if isinstance(default_node, ast.Constant):
328 | return repr(default_node.value)
329 | elif isinstance(default_node, ast.Name):
330 | return default_node.id
331 | elif isinstance(default_node, ast.Attribute):
332 | return self._get_name(default_node)
333 | elif isinstance(default_node, ast.List):
334 | return "[]"
335 | elif isinstance(default_node, ast.Dict):
336 | return "{}"
337 | else:
338 | return "..."
339 | except Exception:
340 | return "..."
341 |
342 | def _get_name(self, node):
343 | """Extract name from AST node, handling complex types safely"""
344 | if node is None:
345 | return "Any"
346 |
347 | try:
348 | if isinstance(node, ast.Name):
349 | return node.id
350 | elif isinstance(node, ast.Attribute):
351 | if hasattr(node, 'value'):
352 | return f"{self._get_name(node.value)}.{node.attr}"
353 | else:
354 | return node.attr
355 | elif isinstance(node, ast.Subscript):
356 | # Handle List[Type], Dict[K,V], etc.
357 | base = self._get_name(node.value)
358 | if hasattr(node, 'slice'):
359 | if isinstance(node.slice, ast.Name):
360 | return f"{base}[{node.slice.id}]"
361 | elif isinstance(node.slice, ast.Tuple):
362 | elts = [self._get_name(elt) for elt in node.slice.elts]
363 | return f"{base}[{', '.join(elts)}]"
364 | elif isinstance(node.slice, ast.Constant):
365 | return f"{base}[{repr(node.slice.value)}]"
366 | elif isinstance(node.slice, ast.Attribute):
367 | return f"{base}[{self._get_name(node.slice)}]"
368 | elif isinstance(node.slice, ast.Subscript):
369 | return f"{base}[{self._get_name(node.slice)}]"
370 | else:
371 | # Try to get the name of the slice, fallback to Any if it fails
372 | try:
373 | slice_name = self._get_name(node.slice)
374 | return f"{base}[{slice_name}]"
375 | except:
376 | return f"{base}[Any]"
377 | return base
378 | elif isinstance(node, ast.Constant):
379 | return str(node.value)
380 | elif isinstance(node, ast.Str): # Python < 3.8
381 | return f'"{node.s}"'
382 | elif isinstance(node, ast.Tuple):
383 | elts = [self._get_name(elt) for elt in node.elts]
384 | return f"({', '.join(elts)})"
385 | elif isinstance(node, ast.List):
386 | elts = [self._get_name(elt) for elt in node.elts]
387 | return f"[{', '.join(elts)}]"
388 | else:
389 | # Fallback for complex types - return a simple string representation
390 | return "Any"
391 | except Exception:
392 | # If anything goes wrong, return a safe default
393 | return "Any"
394 |
395 |
396 | class DirectNeo4jExtractor:
397 | """Creates nodes and relationships directly in Neo4j"""
398 |
399 | def __init__(self, neo4j_uri: str, neo4j_user: str, neo4j_password: str):
400 | self.neo4j_uri = neo4j_uri
401 | self.neo4j_user = neo4j_user
402 | self.neo4j_password = neo4j_password
403 | self.driver = None
404 | self.analyzer = Neo4jCodeAnalyzer()
405 |
406 | async def initialize(self):
407 | """Initialize Neo4j connection"""
408 | logger.info("Initializing Neo4j connection...")
409 | self.driver = AsyncGraphDatabase.driver(
410 | self.neo4j_uri,
411 | auth=(self.neo4j_user, self.neo4j_password)
412 | )
413 |
414 | # Clear existing data
415 | # logger.info("Clearing existing data...")
416 | # async with self.driver.session() as session:
417 | # await session.run("MATCH (n) DETACH DELETE n")
418 |
419 | # Create constraints and indexes
420 | logger.info("Creating constraints and indexes...")
421 | async with self.driver.session() as session:
422 | # Create constraints - using MERGE-friendly approach
423 | await session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (f:File) REQUIRE f.path IS UNIQUE")
424 | await session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (c:Class) REQUIRE c.full_name IS UNIQUE")
425 | # Remove unique constraints for methods/attributes since they can be duplicated across classes
426 | # await session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (m:Method) REQUIRE m.full_name IS UNIQUE")
427 | # await session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (f:Function) REQUIRE f.full_name IS UNIQUE")
428 | # await session.run("CREATE CONSTRAINT IF NOT EXISTS FOR (a:Attribute) REQUIRE a.full_name IS UNIQUE")
429 |
430 | # Create indexes for performance
431 | await session.run("CREATE INDEX IF NOT EXISTS FOR (f:File) ON (f.name)")
432 | await session.run("CREATE INDEX IF NOT EXISTS FOR (c:Class) ON (c.name)")
433 | await session.run("CREATE INDEX IF NOT EXISTS FOR (m:Method) ON (m.name)")
434 |
435 | logger.info("Neo4j initialized successfully")
436 |
437 | async def clear_repository_data(self, repo_name: str):
438 | """Clear all data for a specific repository"""
439 | logger.info(f"Clearing existing data for repository: {repo_name}")
440 | async with self.driver.session() as session:
441 | # Delete in specific order to avoid constraint issues
442 |
443 | # 1. Delete methods and attributes (they depend on classes)
444 | await session.run("""
445 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_METHOD]->(m:Method)
446 | DETACH DELETE m
447 | """, repo_name=repo_name)
448 |
449 | await session.run("""
450 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)-[:HAS_ATTRIBUTE]->(a:Attribute)
451 | DETACH DELETE a
452 | """, repo_name=repo_name)
453 |
454 | # 2. Delete functions (they depend on files)
455 | await session.run("""
456 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(func:Function)
457 | DETACH DELETE func
458 | """, repo_name=repo_name)
459 |
460 | # 3. Delete classes (they depend on files)
461 | await session.run("""
462 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)-[:DEFINES]->(c:Class)
463 | DETACH DELETE c
464 | """, repo_name=repo_name)
465 |
466 | # 4. Delete files (they depend on repository)
467 | await session.run("""
468 | MATCH (r:Repository {name: $repo_name})-[:CONTAINS]->(f:File)
469 | DETACH DELETE f
470 | """, repo_name=repo_name)
471 |
472 | # 5. Finally delete the repository
473 | await session.run("""
474 | MATCH (r:Repository {name: $repo_name})
475 | DETACH DELETE r
476 | """, repo_name=repo_name)
477 |
478 | logger.info(f"Cleared data for repository: {repo_name}")
479 |
480 | async def close(self):
481 | """Close Neo4j connection"""
482 | if self.driver:
483 | await self.driver.close()
484 |
485 | def clone_repo(self, repo_url: str, target_dir: str) -> str:
486 | """Clone repository with shallow clone"""
487 | logger.info(f"Cloning repository to: {target_dir}")
488 | if os.path.exists(target_dir):
489 | logger.info(f"Removing existing directory: {target_dir}")
490 | try:
491 | def handle_remove_readonly(func, path, exc):
492 | try:
493 | if os.path.exists(path):
494 | os.chmod(path, 0o777)
495 | func(path)
496 | except PermissionError:
497 | logger.warning(f"Could not remove {path} - file in use, skipping")
498 | pass
499 | shutil.rmtree(target_dir, onerror=handle_remove_readonly)
500 | except Exception as e:
501 | logger.warning(f"Could not fully remove {target_dir}: {e}. Proceeding anyway...")
502 |
503 | logger.info(f"Running git clone from {repo_url}")
504 | subprocess.run(['git', 'clone', '--depth', '1', repo_url, target_dir], check=True)
505 | logger.info("Repository cloned successfully")
506 | return target_dir
507 |
508 | def get_python_files(self, repo_path: str) -> List[Path]:
509 | """Get Python files, focusing on main source directories"""
510 | python_files = []
511 | exclude_dirs = {
512 | 'tests', 'test', '__pycache__', '.git', 'venv', 'env',
513 | 'node_modules', 'build', 'dist', '.pytest_cache', 'docs',
514 | 'examples', 'example', 'demo', 'benchmark'
515 | }
516 |
517 | for root, dirs, files in os.walk(repo_path):
518 | dirs[:] = [d for d in dirs if d not in exclude_dirs and not d.startswith('.')]
519 |
520 | for file in files:
521 | if file.endswith('.py') and not file.startswith('test_'):
522 | file_path = Path(root) / file
523 | if (file_path.stat().st_size < 500_000 and
524 | file not in ['setup.py', 'conftest.py']):
525 | python_files.append(file_path)
526 |
527 | return python_files
528 |
529 | async def analyze_repository(self, repo_url: str, temp_dir: str = None):
530 | """Analyze repository and create nodes/relationships in Neo4j"""
531 | repo_name = repo_url.split('/')[-1].replace('.git', '')
532 | logger.info(f"Analyzing repository: {repo_name}")
533 |
534 | # Clear existing data for this repository before re-processing
535 | await self.clear_repository_data(repo_name)
536 |
537 | # Set default temp_dir to repos folder at script level
538 | if temp_dir is None:
539 | script_dir = Path(__file__).parent
540 | temp_dir = str(script_dir / "repos" / repo_name)
541 |
542 | # Clone and analyze
543 | repo_path = Path(self.clone_repo(repo_url, temp_dir))
544 |
545 | try:
546 | logger.info("Getting Python files...")
547 | python_files = self.get_python_files(str(repo_path))
548 | logger.info(f"Found {len(python_files)} Python files to analyze")
549 |
550 | # First pass: identify project modules
551 | logger.info("Identifying project modules...")
552 | project_modules = set()
553 | for file_path in python_files:
554 | relative_path = str(file_path.relative_to(repo_path))
555 | module_parts = relative_path.replace('/', '.').replace('.py', '').split('.')
556 | if len(module_parts) > 0 and not module_parts[0].startswith('.'):
557 | project_modules.add(module_parts[0])
558 |
559 | logger.info(f"Identified project modules: {sorted(project_modules)}")
560 |
561 | # Second pass: analyze files and collect data
562 | logger.info("Analyzing Python files...")
563 | modules_data = []
564 | for i, file_path in enumerate(python_files):
565 | if i % 20 == 0:
566 | logger.info(f"Analyzing file {i+1}/{len(python_files)}: {file_path.name}")
567 |
568 | analysis = self.analyzer.analyze_python_file(file_path, repo_path, project_modules)
569 | if analysis:
570 | modules_data.append(analysis)
571 |
572 | logger.info(f"Found {len(modules_data)} files with content")
573 |
574 | # Create nodes and relationships in Neo4j
575 | logger.info("Creating nodes and relationships in Neo4j...")
576 | await self._create_graph(repo_name, modules_data)
577 |
578 | # Print summary
579 | total_classes = sum(len(mod['classes']) for mod in modules_data)
580 | total_methods = sum(len(cls['methods']) for mod in modules_data for cls in mod['classes'])
581 | total_functions = sum(len(mod['functions']) for mod in modules_data)
582 | total_imports = sum(len(mod['imports']) for mod in modules_data)
583 |
584 |             print(f"\n=== Direct Neo4j Repository Analysis for {repo_name} ===")
585 | print(f"Files processed: {len(modules_data)}")
586 | print(f"Classes created: {total_classes}")
587 | print(f"Methods created: {total_methods}")
588 | print(f"Functions created: {total_functions}")
589 | print(f"Import relationships: {total_imports}")
590 |
591 | logger.info(f"Successfully created Neo4j graph for {repo_name}")
592 |
593 | finally:
594 | if os.path.exists(temp_dir):
595 | logger.info(f"Cleaning up temporary directory: {temp_dir}")
596 | try:
597 | def handle_remove_readonly(func, path, exc):
598 | try:
599 | if os.path.exists(path):
600 | os.chmod(path, 0o777)
601 | func(path)
602 | except PermissionError:
603 | logger.warning(f"Could not remove {path} - file in use, skipping")
604 | pass
605 |
606 | shutil.rmtree(temp_dir, onerror=handle_remove_readonly)
607 | logger.info("Cleanup completed")
608 | except Exception as e:
609 | logger.warning(f"Cleanup failed: {e}. Directory may remain at {temp_dir}")
610 | # Don't fail the whole process due to cleanup issues
611 |
612 | async def _create_graph(self, repo_name: str, modules_data: List[Dict]):
613 | """Create all nodes and relationships in Neo4j"""
614 |
615 | async with self.driver.session() as session:
616 | # Create Repository node
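    |             # Plain CREATE is safe here: analyze_repository() already cleared any existing
    |             # graph for this repository via clear_repository_data().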
617 | await session.run(
618 | "CREATE (r:Repository {name: $repo_name, created_at: datetime()})",
619 | repo_name=repo_name
620 | )
621 |
622 | nodes_created = 0
623 | relationships_created = 0
624 |
625 | for i, mod in enumerate(modules_data):
626 | # 1. Create File node
627 | await session.run("""
628 | CREATE (f:File {
629 | name: $name,
630 | path: $path,
631 | module_name: $module_name,
632 | line_count: $line_count,
633 | created_at: datetime()
634 | })
635 | """,
636 | name=mod['file_path'].split('/')[-1],
637 | path=mod['file_path'],
638 | module_name=mod['module_name'],
639 | line_count=mod['line_count']
640 | )
641 | nodes_created += 1
642 |
643 | # 2. Connect File to Repository
644 | await session.run("""
645 | MATCH (r:Repository {name: $repo_name})
646 | MATCH (f:File {path: $file_path})
647 | CREATE (r)-[:CONTAINS]->(f)
648 | """, repo_name=repo_name, file_path=mod['file_path'])
649 | relationships_created += 1
650 |
651 | # 3. Create Class nodes and relationships
652 | for cls in mod['classes']:
653 | # Create Class node using MERGE to avoid duplicates
654 | await session.run("""
655 | MERGE (c:Class {full_name: $full_name})
656 | ON CREATE SET c.name = $name, c.created_at = datetime()
657 | """, name=cls['name'], full_name=cls['full_name'])
658 | nodes_created += 1
659 |
660 | # Connect File to Class
661 | await session.run("""
662 | MATCH (f:File {path: $file_path})
663 | MATCH (c:Class {full_name: $class_full_name})
664 | MERGE (f)-[:DEFINES]->(c)
665 | """, file_path=mod['file_path'], class_full_name=cls['full_name'])
666 | relationships_created += 1
667 |
668 | # 4. Create Method nodes - use MERGE to avoid duplicates
669 | for method in cls['methods']:
670 | method_full_name = f"{cls['full_name']}.{method['name']}"
671 | # Create method with unique ID to avoid conflicts
672 | method_id = f"{cls['full_name']}::{method['name']}"
673 |
674 | await session.run("""
675 | MERGE (m:Method {method_id: $method_id})
676 | ON CREATE SET m.name = $name,
677 | m.full_name = $full_name,
678 | m.args = $args,
679 | m.params_list = $params_list,
680 | m.params_detailed = $params_detailed,
681 | m.return_type = $return_type,
682 | m.created_at = datetime()
683 | """,
684 | name=method['name'],
685 | full_name=method_full_name,
686 | method_id=method_id,
687 | args=method['args'],
688 | params_list=[f"{p['name']}:{p['type']}" for p in method['params']], # Simple format
689 | params_detailed=method.get('params_detailed', []), # Detailed format
690 | return_type=method['return_type']
691 | )
692 | nodes_created += 1
693 |
694 | # Connect Class to Method
695 | await session.run("""
696 | MATCH (c:Class {full_name: $class_full_name})
697 | MATCH (m:Method {method_id: $method_id})
698 | MERGE (c)-[:HAS_METHOD]->(m)
699 | """,
700 | class_full_name=cls['full_name'],
701 | method_id=method_id
702 | )
703 | relationships_created += 1
704 |
705 | # 5. Create Attribute nodes - use MERGE to avoid duplicates
706 | for attr in cls['attributes']:
707 | attr_full_name = f"{cls['full_name']}.{attr['name']}"
708 | # Create attribute with unique ID to avoid conflicts
709 | attr_id = f"{cls['full_name']}::{attr['name']}"
710 | await session.run("""
711 | MERGE (a:Attribute {attr_id: $attr_id})
712 | ON CREATE SET a.name = $name,
713 | a.full_name = $full_name,
714 | a.type = $type,
715 | a.created_at = datetime()
716 | """,
717 | name=attr['name'],
718 | full_name=attr_full_name,
719 | attr_id=attr_id,
720 | type=attr['type']
721 | )
722 | nodes_created += 1
723 |
724 | # Connect Class to Attribute
725 | await session.run("""
726 | MATCH (c:Class {full_name: $class_full_name})
727 | MATCH (a:Attribute {attr_id: $attr_id})
728 | MERGE (c)-[:HAS_ATTRIBUTE]->(a)
729 | """,
730 | class_full_name=cls['full_name'],
731 | attr_id=attr_id
732 | )
733 | relationships_created += 1
734 |
735 | # 6. Create Function nodes (top-level) - use MERGE to avoid duplicates
736 | for func in mod['functions']:
737 | func_id = f"{mod['file_path']}::{func['name']}"
738 | await session.run("""
739 | MERGE (f:Function {func_id: $func_id})
740 | ON CREATE SET f.name = $name,
741 | f.full_name = $full_name,
742 | f.args = $args,
743 | f.params_list = $params_list,
744 | f.params_detailed = $params_detailed,
745 | f.return_type = $return_type,
746 | f.created_at = datetime()
747 | """,
748 | name=func['name'],
749 | full_name=func['full_name'],
750 | func_id=func_id,
751 | args=func['args'],
752 | params_list=func.get('params_list', []), # Simple format for backwards compatibility
753 | params_detailed=func.get('params_detailed', []), # Detailed format
754 | return_type=func['return_type']
755 | )
756 | nodes_created += 1
757 |
758 | # Connect File to Function
759 | await session.run("""
760 | MATCH (file:File {path: $file_path})
761 | MATCH (func:Function {func_id: $func_id})
762 | MERGE (file)-[:DEFINES]->(func)
763 | """, file_path=mod['file_path'], func_id=func_id)
764 | relationships_created += 1
765 |
766 | # 7. Create Import relationships
767 | for import_name in mod['imports']:
768 | # Try to find the target file
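    |                     # OPTIONAL MATCH plus "WHERE target IS NOT NULL" means imports that don't
    |                     # resolve to a file in this repository simply create no relationship.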
769 | await session.run("""
770 | MATCH (source:File {path: $source_path})
771 | OPTIONAL MATCH (target:File)
772 | WHERE target.module_name = $import_name OR target.module_name STARTS WITH $import_name
773 | WITH source, target
774 | WHERE target IS NOT NULL
775 | MERGE (source)-[:IMPORTS]->(target)
776 | """, source_path=mod['file_path'], import_name=import_name)
777 | relationships_created += 1
778 |
779 | if (i + 1) % 10 == 0:
780 | logger.info(f"Processed {i + 1}/{len(modules_data)} files...")
781 |
782 | logger.info(f"Created {nodes_created} nodes and {relationships_created} relationships")
783 |
784 | async def search_graph(self, query_type: str, **kwargs):
785 | """Search the Neo4j graph directly"""
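    |         # Supported query_type values: "files_importing", "classes_in_file", "methods_of_class"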
786 | async with self.driver.session() as session:
787 | if query_type == "files_importing":
788 | target = kwargs.get('target')
789 | result = await session.run("""
790 | MATCH (source:File)-[:IMPORTS]->(target:File)
791 | WHERE target.module_name CONTAINS $target
792 | RETURN source.path as file, target.module_name as imports
793 | """, target=target)
794 | return [{"file": record["file"], "imports": record["imports"]} async for record in result]
795 |
796 | elif query_type == "classes_in_file":
797 | file_path = kwargs.get('file_path')
798 | result = await session.run("""
799 | MATCH (f:File {path: $file_path})-[:DEFINES]->(c:Class)
800 | RETURN c.name as class_name, c.full_name as full_name
801 | """, file_path=file_path)
802 | return [{"class_name": record["class_name"], "full_name": record["full_name"]} async for record in result]
803 |
804 | elif query_type == "methods_of_class":
805 | class_name = kwargs.get('class_name')
806 | result = await session.run("""
807 | MATCH (c:Class)-[:HAS_METHOD]->(m:Method)
808 | WHERE c.name CONTAINS $class_name OR c.full_name CONTAINS $class_name
809 | RETURN m.name as method_name, m.args as args
810 | """, class_name=class_name)
811 | return [{"method_name": record["method_name"], "args": record["args"]} async for record in result]
812 |
813 |
814 | async def main():
815 | """Example usage"""
816 | load_dotenv()
817 |
818 | neo4j_uri = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
819 | neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
820 | neo4j_password = os.environ.get('NEO4J_PASSWORD', 'password')
821 |
822 | extractor = DirectNeo4jExtractor(neo4j_uri, neo4j_user, neo4j_password)
823 |
824 | try:
825 | await extractor.initialize()
826 |
827 | # Analyze repository - direct Neo4j, no LLM processing!
828 | # repo_url = "https://github.com/pydantic/pydantic-ai.git"
829 | repo_url = "https://github.com/getzep/graphiti.git"
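    |         # Note: the example queries below reference pydantic-ai paths and classes,
    |         # so they return matching results when the pydantic-ai repository is analyzed.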
830 | await extractor.analyze_repository(repo_url)
831 |
832 | # Direct graph queries
833 |         print("\n=== Direct Neo4j Queries ===")
834 |
835 | # Which files import from models?
836 | results = await extractor.search_graph("files_importing", target="models")
837 |         print(f"\nFiles importing from 'models': {len(results)}")
838 | for result in results[:3]:
839 | print(f"- {result['file']} imports {result['imports']}")
840 |
841 | # What classes are in a specific file?
842 | results = await extractor.search_graph("classes_in_file", file_path="pydantic_ai/models/openai.py")
843 |         print(f"\nClasses in openai.py: {len(results)}")
844 | for result in results:
845 | print(f"- {result['class_name']}")
846 |
847 | # What methods does OpenAIModel have?
848 | results = await extractor.search_graph("methods_of_class", class_name="OpenAIModel")
849 |         print(f"\nMethods of OpenAIModel: {len(results)}")
850 | for result in results[:5]:
851 | print(f"- {result['method_name']}({', '.join(result['args'])})")
852 |
853 | finally:
854 | await extractor.close()
855 |
856 |
857 | if __name__ == "__main__":
858 | asyncio.run(main())
```