Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions chelombus/clustering/PyQKmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
year = {2017},
}
"""
from typing import Literal, overload

import joblib
from pathlib import Path
import numpy as np
Expand Down Expand Up @@ -226,6 +228,10 @@ def fit(self, X_train: np.ndarray, device: str = 'auto') -> 'PQKMeans':
self._fit_labels = None
return self

@overload
def _fit_gpu(self, X_train: np.ndarray, return_labels: Literal[False] = False) -> None: ...
@overload
def _fit_gpu(self, X_train: np.ndarray, return_labels: Literal[True]) -> np.ndarray: ...
def _fit_gpu(self, X_train: np.ndarray, return_labels: bool = False) -> np.ndarray | None:
"""GPU-accelerated training: Triton assignment + CPU centroid update."""
import time
Expand Down
16 changes: 8 additions & 8 deletions chelombus/encoder/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def __init__(self, k:int=256, m:int=8, iterations=20):
@property
def is_trained(self) -> bool:
    """Return the value of the `encoder_is_trained` flag."""
    return self.encoder_is_trained

def fit(self, X_train:NDArray, verbose:int=1, device:str='cpu', **kwargs)->None:
def fit(self, X_train:NDArray, verbose:int=1, device:str='auto', **kwargs)->None:
""" KMeans fitting of every subvector matrix from the X_train matrix. Populates
the codebook by storing the cluster centers of every subvector

Expand All @@ -65,18 +65,18 @@ def fit(self, X_train:NDArray, verbose:int=1, device:str='cpu', **kwargs)->None:
X_train(np.array): Input matrix to train the encoder.
verbose(int): Level of verbosity. Default is 1
device: 'cpu' for sklearn KMeans, 'gpu' for torch-based KMeans on CUDA,
'auto' to pick GPU if available. Default is 'cpu'.
'auto' picks GPU when available. Default is 'auto'.
**kwargs: Optional keyword arguments passed to the underlying KMeans `fit()` function
(only used on the CPU path).
"""

assert X_train.ndim == 2, "The input can only be a matrix (X.ndim == 2)"
N, D = X_train.shape # N number of input vectors, D dimension of the vectors
assert self.k < N, "the number of training vectors (N for N,D = X_train.shape) should be more than the number of centroids (K)"
assert D % self.m == 0, f"Vector (fingeprint) dimension should be divisible by the number of subvectors (m). Got {D} / {self.m}"
assert D % self.m == 0, f"Vector (fingerprint) dimension should be divisible by the number of subvectors (m). Got {D} / {self.m}"
self.D_subvector = int(D / self.m) # Dimension of the subvector.
self.og_D = D # We save the original dimensions of the input vector (fingerprint) for later use
assert self.encoder_is_trained == False, "Encoder can only be fitted once"
assert not self.encoder_is_trained, "Encoder can only be fitted once"

self.codewords= np.zeros((self.m, self.k, self.D_subvector), dtype=np.float32)

Expand Down Expand Up @@ -225,7 +225,7 @@ def transform(self, X:NDArray, verbose:int=1, device:str='auto', **kwargs) -> ND
for the corresponding subvector.
"""

assert self.encoder_is_trained == True, "PQEncoder must be trained before calling transform"
assert self.encoder_is_trained, "PQEncoder must be trained before calling transform"

use_gpu = (device == 'gpu') or (device == 'auto' and _GPU_AVAILABLE)
if use_gpu:
Expand Down Expand Up @@ -302,18 +302,18 @@ def _transform_gpu(self, X: NDArray) -> NDArray:
del cw_gpu
return pq_codes

def fit_transform(self, X:NDArray, verbose:int=1, device:str='cpu', **kwargs) -> NDArray:
def fit_transform(self, X:NDArray, verbose:int=1, device:str='auto', **kwargs) -> NDArray:
"""Fit and transform the input matrix `X` into its PQ-codes

The encoder is trained on the matrix and then for each sample in X,
the input vector is split into `m` equal-sized subvectors, each represented
byt the index of the closest centroid. Returns a compact representation of X,
by the index of the closest centroid. Returns a compact representation of X,
where each sample is encoded as a sequence of centroid indices (i.e., PQ-codes)

Args:
X (np.array): Input data matrix of shape (n_samples, n_features)
verbose (int, optional): Level of verbosity. Defaults to 1.
device: 'cpu', 'gpu', or 'auto'. Default is 'cpu'.
device: 'cpu', 'gpu', or 'auto' (picks GPU when available). Default is 'auto'.
**kwargs: Optional keyword arguments. These will be passed to the underlying KMeans
predict() function.

Expand Down
Loading