From e8aff9eef16120f78090bfb40bb64f4aa076d23b Mon Sep 17 00:00:00 2001 From: Anush008 Date: Thu, 18 Sep 2025 16:10:09 +0530 Subject: [PATCH] feat: Qdrant vector search backend Signed-off-by: Anush008 --- .../docs/core-concepts/semantic-search.mdx | 29 ++- poetry.lock | 221 ++++++++++++++---- pyproject.toml | 2 + src/kit/vector_searcher.py | 134 ++++++++++- 4 files changed, 335 insertions(+), 51 deletions(-) diff --git a/docs/src/content/docs/core-concepts/semantic-search.mdx b/docs/src/content/docs/core-concepts/semantic-search.mdx index 32736ab9..51ff5697 100644 --- a/docs/src/content/docs/core-concepts/semantic-search.mdx +++ b/docs/src/content/docs/core-concepts/semantic-search.mdx @@ -3,13 +3,13 @@ title: Semantic Searching --- import { Aside } from '@astrojs/starlight/components'; -Semantic search allows you to find code based on meaning rather than just keywords. Kit supports semantic code search using vector embeddings and ChromaDB (both local and cloud), enabling you to search for code using natural language queries. +Semantic search allows you to find code based on meaning rather than just keywords. Kit supports semantic code search using vector embeddings with multiple vector database backends: ChromaDB (both local and cloud) and Qdrant, enabling you to search for code using natural language queries. ## How it works - Chunks your codebase (by symbols or lines) - Embeds each chunk using your chosen model (OpenAI, HuggingFace, etc) -- Stores embeddings in a local ChromaDB vector database +- Stores embeddings in a vector database (ChromaDB or Qdrant) - Lets you search for code using natural language or code-like queries ## Quick Start @@ -249,9 +249,30 @@ To migrate from cloud to local: 1. Unset `KIT_USE_CHROMA_CLOUD` 2. Rebuild your indexes locally -#### Other Backends +#### Qdrant (Alternative Backend) -While the `VectorDBBackend` interface is designed to support other vector databases, ChromaDB (local and cloud) is the primary focus for now. 
If you need other backends like Faiss, please raise an issue on the kit GitHub repository. +Kit also supports [Qdrant](https://qdrant.tech/), an open-source, high-performance vector search engine that can be run locally or as a managed service. + +##### Prerequisites + +- Qdrant client library (`pip install qdrant-client`) +- Python 3.10+ +- Qdrant server (local or remote) + +##### Configuration + +Set the following environment variables to use Qdrant: + +```bash +# Required: Enable Qdrant backend +export KIT_VECTOR_BACKEND="qdrant" + +# Optional: Remote Qdrant instance (defaults to local) +export QDRANT_URL="http://localhost:6333" + +# Optional: API key for remote instances +export QDRANT_API_KEY="your-api-key" +``` ## Usage Patterns diff --git a/poetry.lock b/poetry.lock index 4d643186..42a5d0d5 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.4 and should not be changed by hand. 
[[package]] name = "annotated-types" @@ -114,7 +114,7 @@ description = "Function decoration for backoff and retry" optional = true python-versions = ">=3.7,<4.0" groups = ["main"] -markers = "python_version < \"3.13\" and (extra == \"ml\" or extra == \"all\")" +markers = "python_version <= \"3.12\" and (extra == \"ml\" or extra == \"all\")" files = [ {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, @@ -892,6 +892,23 @@ files = [ {file = "h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1"}, ] +[[package]] +name = "h2" +version = "4.3.0" +description = "Pure-Python HTTP/2 protocol implementation" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"ml\" or extra == \"all\"" +files = [ + {file = "h2-4.3.0-py3-none-any.whl", hash = "sha256:c438f029a25f7945c69e0ccf0fb951dc3f73a5f6412981daee861431b70e2bdd"}, + {file = "h2-4.3.0.tar.gz", hash = "sha256:6c59efe4323fa18b47a632221a1888bd7fde6249819beda254aeca909f221bf1"}, +] + +[package.dependencies] +hpack = ">=4.1,<5" +hyperframe = ">=6.1,<7" + [[package]] name = "hf-xet" version = "1.1.5" @@ -914,6 +931,19 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "hpack" +version = "4.1.0" +description = "Pure-Python HPACK header encoding" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"ml\" or extra == \"all\"" +files = [ + {file = "hpack-4.1.0-py3-none-any.whl", hash = "sha256:157ac792668d995c657d93111f46b4535ed114f0c9c8d672271bbec7eae1b496"}, + {file = "hpack-4.1.0.tar.gz", hash = "sha256:ec5eca154f7056aa06f196a557655c5b009b382873ac8d1e66e79e87535f1dca"}, +] + [[package]] name = "httpcore" version = "1.0.9" @@ -1007,6 +1037,7 @@ files = [ [package.dependencies] anyio = "*" certifi = "*" +h2 = {version 
= ">=3,<5", optional = true, markers = "extra == \"http2\""} httpcore = "==1.*" idna = "*" @@ -1085,6 +1116,19 @@ files = [ [package.dependencies] pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} +[[package]] +name = "hyperframe" +version = "6.1.0" +description = "Pure-Python HTTP/2 framing" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"ml\" or extra == \"all\"" +files = [ + {file = "hyperframe-6.1.0-py3-none-any.whl", hash = "sha256:b03380493a519fce58ea5af42e4a42317bf9bd425596f7a0835ffce80f1a42e5"}, + {file = "hyperframe-6.1.0.tar.gz", hash = "sha256:f630908a00854a7adeabd6382b43923a4c4cd4b821fcb527e6ab9e15382a3b08"}, +] + [[package]] name = "id" version = "1.5.0" @@ -1128,7 +1172,7 @@ description = "Read metadata from Python packages" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"dev\" or extra == \"ml\" or extra == \"all\") and (platform_machine != \"ppc64le\" and platform_machine != \"s390x\" or extra == \"ml\" or extra == \"all\" or python_full_version < \"3.10.2\") and (python_version < \"3.12\" or extra == \"ml\" or extra == \"all\")" +markers = "(extra == \"dev\" or extra == \"ml\" or extra == \"all\") and (python_version < \"3.12\" or extra == \"ml\" or extra == \"all\") and (platform_machine != \"ppc64le\" and platform_machine != \"s390x\" or python_full_version < \"3.10.2\" or extra == \"ml\" or extra == \"all\")" files = [ {file = "importlib_metadata-8.7.0-py3-none-any.whl", hash = "sha256:e5dd1551894c77868a30651cef00984d50e1002d06942a7101d34870c5f02afd"}, {file = "importlib_metadata-8.7.0.tar.gz", hash = "sha256:d13b81ad223b890aa16c5471f2ac3056cf76c5f10f82d6f9292f0b415f389000"}, @@ -1772,7 +1816,7 @@ description = "Python package for creating and manipulating graphs and networks" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version < \"3.13\" and (extra == \"ml\" or extra == \"all\")" +markers 
= "python_version < \"3.12\" and (extra == \"ml\" or extra == \"all\")" files = [ {file = "networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f"}, {file = "networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1"}, @@ -1793,7 +1837,7 @@ description = "Python package for creating and manipulating graphs and networks" optional = true python-versions = ">=3.11" groups = ["main"] -markers = "python_version >= \"3.13\" and (extra == \"ml\" or extra == \"all\")" +markers = "python_version >= \"3.12\" and (extra == \"ml\" or extra == \"all\")" files = [ {file = "networkx-3.5-py3-none-any.whl", hash = "sha256:0030d386a9a06dee3565298b4a734b68589749a544acbb6c412dc9e2489ec6ec"}, {file = "networkx-3.5.tar.gz", hash = "sha256:d4c6f9cf81f52d69230866796b82afbccdec3db7ae4fbd1b65ea750feed50037"}, @@ -1915,7 +1959,7 @@ description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.11" groups = ["main"] -markers = "python_version >= \"3.13\"" +markers = "python_version >= \"3.12\"" files = [ {file = "numpy-2.3.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ea9e48336a402551f52cd8f593343699003d2353daa4b72ce8d34f66b722070"}, {file = "numpy-2.3.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5ccb7336eaf0e77c1635b232c141846493a588ec9ea777a7c24d7166bb8533ae"}, @@ -2623,6 +2667,27 @@ files = [ dev = ["pre-commit", "tox"] testing = ["coverage", "pytest", "pytest-benchmark"] +[[package]] +name = "portalocker" +version = "3.2.0" +description = "Wraps the portalocker recipe for easy usage" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"ml\" or extra == \"all\"" +files = [ + {file = "portalocker-3.2.0-py3-none-any.whl", hash = "sha256:3cdc5f565312224bc570c49337bd21428bba0ef363bbcf58b9ef4a9f11779968"}, + {file = "portalocker-3.2.0.tar.gz", hash = 
"sha256:1f3002956a54a8c3730586c5c77bf18fae4149e07eaf1c29fc3faf4d5a3f89ac"}, +] + +[package.dependencies] +pywin32 = {version = ">=226", markers = "platform_system == \"Windows\""} + +[package.extras] +docs = ["portalocker[tests]"] +redis = ["redis"] +tests = ["coverage-conditional-plugin (>=0.9.0)", "portalocker[redis]", "pytest (>=5.4.1)", "pytest-cov (>=2.8.1)", "pytest-mypy (>=0.8.0)", "pytest-rerunfailures (>=15.0)", "pytest-timeout (>=2.1.0)", "sphinx (>=6.0.0)", "types-pywin32 (>=310.0.0.20250429)", "types-redis"] + [[package]] name = "posthog" version = "5.4.0" @@ -3161,6 +3226,37 @@ files = [ {file = "python_multipart-0.0.20.tar.gz", hash = "sha256:8dd0cab45b8e23064ae09147625994d090fa46f5b0d1e13af944c331a7fa9d13"}, ] +[[package]] +name = "pywin32" +version = "311" +description = "Python for Window Extensions" +optional = true +python-versions = "*" +groups = ["main"] +markers = "(extra == \"ml\" or extra == \"all\") and platform_system == \"Windows\"" +files = [ + {file = "pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3"}, + {file = "pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b"}, + {file = "pywin32-311-cp310-cp310-win_arm64.whl", hash = "sha256:0502d1facf1fed4839a9a51ccbcc63d952cf318f78ffc00a7e78528ac27d7a2b"}, + {file = "pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151"}, + {file = "pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503"}, + {file = "pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2"}, + {file = "pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31"}, + {file = "pywin32-311-cp312-cp312-win_amd64.whl", hash = 
"sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067"}, + {file = "pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852"}, + {file = "pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d"}, + {file = "pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d"}, + {file = "pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a"}, + {file = "pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee"}, + {file = "pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87"}, + {file = "pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42"}, + {file = "pywin32-311-cp38-cp38-win32.whl", hash = "sha256:6c6f2969607b5023b0d9ce2541f8d2cbb01c4f46bc87456017cf63b73f1e2d8c"}, + {file = "pywin32-311-cp38-cp38-win_amd64.whl", hash = "sha256:c8015b09fb9a5e188f83b7b04de91ddca4658cee2ae6f3bc483f0b21a77ef6cd"}, + {file = "pywin32-311-cp39-cp39-win32.whl", hash = "sha256:aba8f82d551a942cb20d4a83413ccbac30790b50efb89a75e4f586ac0bb8056b"}, + {file = "pywin32-311-cp39-cp39-win_amd64.whl", hash = "sha256:e0c4cfb0621281fe40387df582097fd796e80430597cb9944f0ae70447bacd91"}, + {file = "pywin32-311-cp39-cp39-win_arm64.whl", hash = "sha256:62ea666235135fee79bb154e695f3ff67370afefd71bd7fea7512fc70ef31e3d"}, +] + [[package]] name = "pywin32-ctypes" version = "0.2.3" @@ -3237,6 +3333,36 @@ files = [ {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +[[package]] +name = "qdrant-client" +version = "1.15.1" +description = "Client library for the Qdrant vector search engine" 
+optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"ml\" or extra == \"all\"" +files = [ + {file = "qdrant_client-1.15.1-py3-none-any.whl", hash = "sha256:2b975099b378382f6ca1cfb43f0d59e541be6e16a5892f282a4b8de7eff5cb63"}, + {file = "qdrant_client-1.15.1.tar.gz", hash = "sha256:631f1f3caebfad0fd0c1fba98f41be81d9962b7bf3ca653bed3b727c0e0cbe0e"}, +] + +[package.dependencies] +grpcio = ">=1.41.0" +httpx = {version = ">=0.20.0", extras = ["http2"]} +numpy = [ + {version = ">=1.21", markers = "python_version >= \"3.10\" and python_version < \"3.12\""}, + {version = ">=1.26", markers = "python_version == \"3.12\""}, + {version = ">=2.1.0", markers = "python_version >= \"3.13\""}, +] +portalocker = ">=2.7.0,<4.0" +protobuf = ">=3.20.0" +pydantic = ">=1.10.8,<2.0.dev0 || >2.2.0" +urllib3 = ">=1.26.14,<3" + +[package.extras] +fastembed = ["fastembed (>=0.7,<0.8)"] +fastembed-gpu = ["fastembed-gpu (>=0.7,<0.8)"] + [[package]] name = "readme-renderer" version = "44.0" @@ -3641,7 +3767,7 @@ description = "Pure-Python RSA implementation" optional = false python-versions = "<4,>=3.6" groups = ["main"] -markers = "python_version < \"3.13\"" +markers = "python_version <= \"3.12\"" files = [ {file = "rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762"}, {file = "rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75"}, @@ -3747,7 +3873,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.10" groups = ["main"] -markers = "python_version < \"3.13\" and (extra == \"ml\" or extra == \"all\")" +markers = "python_version < \"3.12\" and (extra == \"ml\" or extra == \"all\")" files = [ {file = "scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c"}, {file = "scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = 
"sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253"}, @@ -3812,7 +3938,7 @@ description = "Fundamental algorithms for scientific computing in Python" optional = true python-versions = ">=3.11" groups = ["main"] -markers = "python_version >= \"3.13\" and (extra == \"ml\" or extra == \"all\")" +markers = "python_version >= \"3.12\" and (extra == \"ml\" or extra == \"all\")" files = [ {file = "scipy-1.16.0-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:deec06d831b8f6b5fb0b652433be6a09db29e996368ce5911faf673e78d20085"}, {file = "scipy-1.16.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d30c0fe579bb901c61ab4bb7f3eeb7281f0d4c4a7b52dbf563c89da4fd2949be"}, @@ -3915,7 +4041,7 @@ description = "Easily download, build, install, upgrade, and uninstall Python pa optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"ml\" or extra == \"all\") and (python_version >= \"3.12\" or platform_system == \"Linux\") and (python_version >= \"3.12\" or platform_machine == \"x86_64\")" +markers = "(extra == \"ml\" or extra == \"all\") and (platform_system == \"Linux\" or python_version >= \"3.12\") and (platform_machine == \"x86_64\" or python_version >= \"3.12\")" files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, @@ -4340,41 +4466,48 @@ vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "tree-sitter" -version = "0.24.0" +version = "0.25.1" description = "Python bindings to the Tree-sitter parsing library" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "tree-sitter-0.24.0.tar.gz", hash = "sha256:abd95af65ca2f4f7eca356343391ed669e764f37748b5352946f00f7fc78e734"}, - {file = "tree_sitter-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:f3f00feff1fc47a8e4863561b8da8f5e023d382dd31ed3e43cd11d4cae445445"}, - {file = "tree_sitter-0.24.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f9691be48d98c49ef8f498460278884c666b44129222ed6217477dffad5d4831"}, - {file = "tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:098a81df9f89cf254d92c1cd0660a838593f85d7505b28249216661d87adde4a"}, - {file = "tree_sitter-0.24.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b26bf9e958da6eb7e74a081aab9d9c7d05f9baeaa830dbb67481898fd16f1f5"}, - {file = "tree_sitter-0.24.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2a84ff87a2f2a008867a1064aba510ab3bd608e3e0cd6e8fef0379efee266c73"}, - {file = "tree_sitter-0.24.0-cp310-cp310-win_amd64.whl", hash = "sha256:c012e4c345c57a95d92ab5a890c637aaa51ab3b7ff25ed7069834b1087361c95"}, - {file = "tree_sitter-0.24.0-cp310-cp310-win_arm64.whl", hash = "sha256:033506c1bc2ba7bd559b23a6bdbeaf1127cee3c68a094b82396718596dfe98bc"}, - {file = "tree_sitter-0.24.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:de0fb7c18c6068cacff46250c0a0473e8fc74d673e3e86555f131c2c1346fb13"}, - {file = "tree_sitter-0.24.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a7c9c89666dea2ce2b2bf98e75f429d2876c569fab966afefdcd71974c6d8538"}, - {file = "tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4ddb113e6b8b3e3b199695b1492a47d87d06c538e63050823d90ef13cac585fd"}, - {file = "tree_sitter-0.24.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:01ea01a7003b88b92f7f875da6ba9d5d741e0c84bb1bd92c503c0eecd0ee6409"}, - {file = "tree_sitter-0.24.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:464fa5b2cac63608915a9de8a6efd67a4da1929e603ea86abaeae2cb1fe89921"}, - {file = "tree_sitter-0.24.0-cp311-cp311-win_amd64.whl", hash = "sha256:3b1f3cbd9700e1fba0be2e7d801527e37c49fc02dc140714669144ef6ab58dce"}, - {file = 
"tree_sitter-0.24.0-cp311-cp311-win_arm64.whl", hash = "sha256:f3f08a2ca9f600b3758792ba2406971665ffbad810847398d180c48cee174ee2"}, - {file = "tree_sitter-0.24.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:14beeff5f11e223c37be7d5d119819880601a80d0399abe8c738ae2288804afc"}, - {file = "tree_sitter-0.24.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26a5b130f70d5925d67b47db314da209063664585a2fd36fa69e0717738efaf4"}, - {file = "tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5fc5c3c26d83c9d0ecb4fc4304fba35f034b7761d35286b936c1db1217558b4e"}, - {file = "tree_sitter-0.24.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:772e1bd8c0931c866b848d0369b32218ac97c24b04790ec4b0e409901945dd8e"}, - {file = "tree_sitter-0.24.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:24a8dd03b0d6b8812425f3b84d2f4763322684e38baf74e5bb766128b5633dc7"}, - {file = "tree_sitter-0.24.0-cp312-cp312-win_amd64.whl", hash = "sha256:f9e8b1605ab60ed43803100f067eed71b0b0e6c1fb9860a262727dbfbbb74751"}, - {file = "tree_sitter-0.24.0-cp312-cp312-win_arm64.whl", hash = "sha256:f733a83d8355fc95561582b66bbea92ffd365c5d7a665bc9ebd25e049c2b2abb"}, - {file = "tree_sitter-0.24.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0d4a6416ed421c4210f0ca405a4834d5ccfbb8ad6692d4d74f7773ef68f92071"}, - {file = "tree_sitter-0.24.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:e0992d483677e71d5c5d37f30dfb2e3afec2f932a9c53eec4fca13869b788c6c"}, - {file = "tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57277a12fbcefb1c8b206186068d456c600dbfbc3fd6c76968ee22614c5cd5ad"}, - {file = "tree_sitter-0.24.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d25fa22766d63f73716c6fec1a31ee5cf904aa429484256bd5fdf5259051ed74"}, - {file = "tree_sitter-0.24.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:7d5d9537507e1c8c5fa9935b34f320bfec4114d675e028f3ad94f11cf9db37b9"}, - {file = "tree_sitter-0.24.0-cp313-cp313-win_amd64.whl", hash = "sha256:f58bb4956917715ec4d5a28681829a8dad5c342cafd4aea269f9132a83ca9b34"}, - {file = "tree_sitter-0.24.0-cp313-cp313-win_arm64.whl", hash = "sha256:23641bd25dcd4bb0b6fa91b8fb3f46cc9f1c9f475efe4d536d3f1f688d1b84c8"}, + {file = "tree-sitter-0.25.1.tar.gz", hash = "sha256:cd761ad0e4d1fc88a4b1b8083bae06d4f973acf6f5f29bbf13ea9609c1dec9c1"}, + {file = "tree_sitter-0.25.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a15d62ffdb095d509bda8c140c1ddd0cc80f0c67f92b87fcc96cd242dc0c71ea"}, + {file = "tree_sitter-0.25.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1d938f0a1ffad1206a1a569b0501345eeca81cae0a4487bb485e53768b02f24e"}, + {file = "tree_sitter-0.25.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ba8cea296de5dcb384b9a15cf526985ac8339c81da51c7e29a251d82071f5ee9"}, + {file = "tree_sitter-0.25.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:387fd2bd8657d69e877618dc199c18e2d6fe073b8f5c59e23435f3baee4ee10a"}, + {file = "tree_sitter-0.25.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:afa49e51f82b58ae2c1291d6b79ca31e0fb36c04bd9a20d89007472edfb70136"}, + {file = "tree_sitter-0.25.1-cp310-cp310-win_amd64.whl", hash = "sha256:77be45f666adf284914510794b41100decccd71dba88010c03dc2bb0d653acec"}, + {file = "tree_sitter-0.25.1-cp310-cp310-win_arm64.whl", hash = "sha256:72badac2de4e81ae0df5efe14ec5003bd4df3e48e7cf84dbd9df3a54599ba371"}, + {file = "tree_sitter-0.25.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:33a8fbaeb2b5049cf5318306ab8b16ab365828b2b21ee13678c29e0726a1d27a"}, + {file = "tree_sitter-0.25.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:797bbbc686d8d3722d25ee0108ad979bda6ad3e1025859ce2ee290e517816bd4"}, + {file = 
"tree_sitter-0.25.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:629fc2ae3f5954b0f6a7b42ee3fcd8f34b68ea161e9f02fa5bf709cbbac996d3"}, + {file = "tree_sitter-0.25.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4257018c42a33a7935a5150d678aac05c6594347d6a6e6dbdf7e2ef4ae985213"}, + {file = "tree_sitter-0.25.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4027854c9feee2a3bb99642145ba04ce95d75bd17e292911c93a488cb28d0a04"}, + {file = "tree_sitter-0.25.1-cp311-cp311-win_amd64.whl", hash = "sha256:183faaedcee5f0a3ba39257fa81749709d5eb7cf92c2c050b36ff38468d1774c"}, + {file = "tree_sitter-0.25.1-cp311-cp311-win_arm64.whl", hash = "sha256:6a3800235535a2532ce392ed0d8e6f698ee010e73805bdeac2f249da8246bab6"}, + {file = "tree_sitter-0.25.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:9362a202144075b54f7c9f07e0b0e44a61eed7ee19e140c506b9e64c1d21ed58"}, + {file = "tree_sitter-0.25.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:593f22529f34dd04de02f56ea6d7c2c8ec99dfab25b58be893247c1090dedd60"}, + {file = "tree_sitter-0.25.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ebb6849f76e1cbfa223303fa680da533d452e378d5fe372598e4752838ca7929"}, + {file = "tree_sitter-0.25.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:034d4544bb0f82e449033d76dd083b131c3f9ecb5e37d3475f80ae55e8f382bd"}, + {file = "tree_sitter-0.25.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:46a9b721560070f2f980105266e28a17d3149485582cdba14d66dca14692e932"}, + {file = "tree_sitter-0.25.1-cp312-cp312-win_amd64.whl", hash = "sha256:9a5c522b1350a626dc1cbc5dc203133caeaa114d3f65e400445e8b02f18b343b"}, + {file = "tree_sitter-0.25.1-cp312-cp312-win_arm64.whl", hash = "sha256:43e7b8e83f9fc29ca62e7d2aa8c38e3fa806ff3fc65e0d501d18588dc1509888"}, + {file = 
"tree_sitter-0.25.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae1eebc175e6a50b38b0e0385cdc26e92ac0bff9b32ee1c0619bbbf6829d57ea"}, + {file = "tree_sitter-0.25.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9e0ae03c4f132f1bffb2bc40b1bb28742785507da693ab04da8531fe534ada9c"}, + {file = "tree_sitter-0.25.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:acf571758be0a71046a61a0936cb815f15b13e0ae7ec6d08398e4aa1560b371d"}, + {file = "tree_sitter-0.25.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:632910847e3f8ae35841f92cba88a9a1b8bc56ecc1514a5affebf7951fa0fc0a"}, + {file = "tree_sitter-0.25.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a99ecef7771afb118b2a8435c8ba67ea7a085c60d5d33dc0a4794ed882e5f7df"}, + {file = "tree_sitter-0.25.1-cp313-cp313-win_amd64.whl", hash = "sha256:c1d6393454d1f9d4195c74e40a487640cd4390cd4aee90837485f932a1a0f40c"}, + {file = "tree_sitter-0.25.1-cp313-cp313-win_arm64.whl", hash = "sha256:c1d2dbf7d12426b71ff49739f599c355f4de338a5c0ab994de2a1d290f6e0b20"}, + {file = "tree_sitter-0.25.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:32cee52264d9ecf98885fcac0185ac63e16251b31dd8b4a3b8d8071173405f8f"}, + {file = "tree_sitter-0.25.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:ae024d8ccfef51e61c44a81af7a48670601430701c24f450bea10f4b4effd8d1"}, + {file = "tree_sitter-0.25.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d025c56c393cea660df9ef33ca60329952a1f8ee6212d21b2b390dfec08a3874"}, + {file = "tree_sitter-0.25.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:044aa23ea14f337809821bea7467f33f4c6d351739dca76ba0cbe4d0154d8662"}, + {file = "tree_sitter-0.25.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:1863d96704eb002df4ad3b738294ae8bd5dcf8cefb715da18bff6cb2d33d978e"}, + {file = 
"tree_sitter-0.25.1-cp314-cp314-win_amd64.whl", hash = "sha256:a40a481e28e1afdbc455932d61e49ffd4163aafa83f4a3deb717524a7786197e"}, + {file = "tree_sitter-0.25.1-cp314-cp314-win_arm64.whl", hash = "sha256:f7b68f584336b39b2deab9896b629dddc3c784170733d3409f01fe825e9c04eb"}, ] [package.extras] @@ -4883,7 +5016,7 @@ description = "Backport of pathlib-compatible object wrapper for zip files" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "(extra == \"dev\" or extra == \"ml\" or extra == \"all\") and (platform_machine != \"ppc64le\" and platform_machine != \"s390x\" or extra == \"ml\" or extra == \"all\" or python_full_version < \"3.10.2\") and (python_version < \"3.12\" or extra == \"ml\" or extra == \"all\")" +markers = "(extra == \"dev\" or extra == \"ml\" or extra == \"all\") and (python_version < \"3.12\" or extra == \"ml\" or extra == \"all\") and (platform_machine != \"ppc64le\" and platform_machine != \"s390x\" or python_full_version < \"3.10.2\" or extra == \"ml\" or extra == \"all\")" files = [ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, @@ -4898,12 +5031,12 @@ test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_it type = ["pytest-mypy"] [extras] -all = ["chromadb", "sentence-transformers"] +all = ["chromadb", "qdrant-client", "sentence-transformers"] dev = ["build", "twine"] -ml = ["chromadb", "sentence-transformers"] +ml = ["chromadb", "qdrant-client", "sentence-transformers"] test-api = ["fastapi", "pytest"] [metadata] lock-version = "2.1" python-versions = ">=3.10" -content-hash = "12ee3a7f1187bc3da8e74e4ca2b15b83f11d4efdca10cd953d7de52e3557151f" +content-hash = "9eb7fecc07f162693e29ae3884f73a346cd7a0f0524c7ac2d561d681297b19bb" diff --git a/pyproject.toml b/pyproject.toml index 411e5a5e..7697535f 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -91,10 +91,12 @@ test-api = [ ml = [ "sentence-transformers>=2.2.0", # For VectorSearcher and DocstringIndexer "chromadb>=1.0.0", # Vector database for semantic search (1.0+ required for Cloud) + "qdrant-client>=1.15.1" # Vector search engine for semantic search ] all = [ "sentence-transformers>=2.2.0", "chromadb>=1.0.0", + "qdrant-client>=1.15.1" # Vector search engine for semantic search ] [tool.ruff] diff --git a/src/kit/vector_searcher.py b/src/kit/vector_searcher.py index 280e5dc7..d6ffc923 100644 --- a/src/kit/vector_searcher.py +++ b/src/kit/vector_searcher.py @@ -1,5 +1,6 @@ import os import re +import uuid from typing import Any, Dict, List, Optional try: @@ -12,6 +13,12 @@ chromadb = None # type: ignore[assignment] CloudClient = None # type: ignore[assignment] +try: + from qdrant_client import QdrantClient, models +except ImportError: + QdrantClient = None # type: ignore[misc,assignment] + models = None # type: ignore[misc,assignment] + class VectorDBBackend: """ @@ -210,13 +217,118 @@ def delete(self, ids: List[str]): pass +class QdrantBackend(VectorDBBackend): + """Qdrant backend for vector search.""" + + def __init__( + self, + collection_name: Optional[str] = None, + url: Optional[str] = None, + api_key: Optional[str] = None, + persist_dir: Optional[str] = None, + ): + if QdrantClient is None or models is None: + raise ImportError("qdrant-client is not installed. 
Run 'pip install qdrant-client'.") + + self.is_local = persist_dir is not None + self.collection_name = collection_name or "kit_code_chunks" + self.vector_size: Optional[int] = None # Will be set when we first add embeddings + + if self.is_local: + self.client = QdrantClient(path=persist_dir) + else: + url = url or os.environ.get("QDRANT_URL", "http://localhost:6333") + api_key = api_key or os.environ.get("QDRANT_API_KEY") + + self.client = QdrantClient( + url=url, + api_key=api_key, + ) + + def _ensure_collection_exists(self, vector_size: int): + """Create collection if it doesn't exist.""" + if not self.client.collection_exists(self.collection_name): + self.client.create_collection( + collection_name=self.collection_name, + vectors_config=models.VectorParams(size=vector_size, distance=models.Distance.COSINE), + ) + + def add(self, embeddings, metadatas, ids: Optional[List[str]] = None): + """Add embeddings to the collection.""" + if not embeddings or not metadatas: + return + + # Detect vector size from first embedding if not set + if self.vector_size is None: + self.vector_size = len(embeddings[0]) + self._ensure_collection_exists(self.vector_size) + + final_ids = ids + if final_ids is None: + final_ids = [str(uuid.uuid4()) for _ in range(len(metadatas))] + elif len(final_ids) != len(embeddings): + raise ValueError("The number of IDs must match the number of embeddings and metadatas.") + + # Convert string IDs to deterministic UUIDs and store original IDs in payload + uuid_ids = [] + for point_id in final_ids: + deterministic_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, point_id)) + uuid_ids.append(deterministic_uuid) + + points = [] + for i, (embedding, metadata, point_id, original_id) in enumerate( + zip(embeddings, metadatas, uuid_ids, final_ids) + ): + payload = metadata.copy() + payload["original_id"] = original_id + + points.append(models.PointStruct(id=point_id, vector=embedding, payload=payload)) + + self.client.upsert(collection_name=self.collection_name, 
points=points) + + def query(self, embedding, top_k): + """Query the collection for similar vectors.""" + search_result = self.client.query_points( + collection_name=self.collection_name, query=embedding, limit=top_k + ).points + + hits = [] + for hit in search_result: + metadata = hit.payload.copy() + metadata["score"] = hit.score + hits.append(metadata) + + return hits + + def persist(self): + """Persist data (Qdrant handles this automatically).""" + pass + + def count(self) -> int: + """Get the number of vectors in the collection.""" + return self.client.count(self.collection_name).count + + def delete(self, ids: List[str]): + """Delete vectors by ID.""" + if not ids: + return + + uuid_ids = [] + for original_id in ids: + deterministic_uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, original_id)) + uuid_ids.append(deterministic_uuid) + + self.client.delete(collection_name=self.collection_name, points_selector=models.PointIdsList(points=uuid_ids)) # type: ignore[arg-type] + + def get_default_backend(persist_dir: Optional[str] = None, collection_name: Optional[str] = None) -> VectorDBBackend: """ Factory function to create the appropriate backend based on environment configuration. - Checks KIT_USE_CHROMA_CLOUD environment variable to determine backend: - - If KIT_USE_CHROMA_CLOUD is "true" and CHROMA_API_KEY is set: uses ChromaCloudBackend - - Otherwise: uses local ChromaDBBackend + Backend selection priority: + 1. If KIT_VECTOR_BACKEND is set to "qdrant": uses QdrantBackend + 2. If KIT_USE_CHROMA_CLOUD is "true" and CHROMA_API_KEY is set: uses ChromaCloudBackend + 3. 
Otherwise: uses local ChromaDBBackend Args: persist_dir: Directory for local persistence (ignored for cloud backend) @@ -225,6 +337,21 @@ def get_default_backend(persist_dir: Optional[str] = None, collection_name: Opti Returns: VectorDBBackend instance """ + # Check for explicit backend selection + backend_type = os.environ.get("KIT_VECTOR_BACKEND", "").lower() + + if backend_type == "qdrant": + qdrant_url = os.environ.get("QDRANT_URL") + qdrant_api_key = os.environ.get("QDRANT_API_KEY") + + if qdrant_url or qdrant_api_key: + return QdrantBackend(collection_name=collection_name, url=qdrant_url, api_key=qdrant_api_key) + else: + if persist_dir is None: + raise ValueError("persist_dir is required for local Qdrant backend") + return QdrantBackend(collection_name=collection_name, persist_dir=persist_dir) + + # Fall back to Chroma backends use_cloud = os.environ.get("KIT_USE_CHROMA_CLOUD", "").lower() == "true" if use_cloud: @@ -257,6 +384,7 @@ def __init__(self, repo, embed_fn, backend: Optional[VectorDBBackend] = None, pe if backend is None: backend = get_default_backend(self.persist_dir, collection_name="kit_code_chunks") self.backend = backend + self.chunk_metadatas: List[Dict[str, Any]] = [] self.chunk_embeddings: List[List[float]] = []