diff --git a/pykeops/benchmarks/plot_benchmark_KNN.py b/pykeops/benchmarks/plot_benchmark_KNN.py
index 864bb4dcd..087cf536d 100644
--- a/pykeops/benchmarks/plot_benchmark_KNN.py
+++ b/pykeops/benchmarks/plot_benchmark_KNN.py
@@ -62,8 +62,6 @@
 use_cuda = torch.cuda.is_available()
 
-
-##############################################
 # We then specify the values of K that we will inspect:
 Ks = [1, 10, 50, 100]  # Numbers of neighbors to find
@@ -386,6 +384,69 @@ def f(x_test):
 
     return fit
 
+############################################################################
+# KeOps IVF-Flat implementation
+# --------------------------------------
+#
+# KeOps IVF-Flat is an approximation method that leverages the KeOps engine.
+# It relies on the IVF-Flat approximation algorithm, which comprises four
+# steps: (1) split the training data into clusters using k-means, (2) store
+# the 'a' nearest clusters to each cluster, (3) assign each query point to its
+# nearest cluster, and (4) perform the nearest neighbour search over only the
+# 'a' clusters nearest to that cluster. Steps (1) and (2) are performed during
+# fitting, while steps (3) and (4) are performed at query time. Steps (3) and
+# (4) save time during queries by reducing the number of pairwise distance
+# computations.
+
+from pykeops.torch.knn import IVF
+
+
+def KNN_KeOps_ivf_flat(K, metric="euclidean", clusters=100, a=10, **kwargs):
+
+    # Setup the K-NN estimator:
+    if metric == "angular":
+        metric = "angular_full"  # alternative metric for non-normalised data
+    KNN = IVF(k=K, metric=metric)
+
+    def fit(x_train):
+        x_train = tensor(x_train)
+        start = timer()
+        KNN.fit(x_train, clusters=clusters, a=a)
+        elapsed = timer() - start
+
+        def f(x_test):
+            x_test = tensor(x_test)
+            start = timer()
+            indices = KNN.kneighbors(x_test)
+            elapsed = timer() - start
+            indices = indices.cpu().numpy()
+
+            return indices, elapsed
+
+        return f, elapsed
+
+    return fit
+
+
+##################################################################
+# The time savings and accuracies achieved depend on the underlying data
+# structure, the number of clusters and the 'a' parameter. The algorithm
+# slows down noticeably for more than 200 clusters. Reducing the proportion
+# of clusters searched over (i.e. the a/clusters ratio) speeds up the
+# algorithm, but lowers its accuracy. For structured data (e.g. MNIST), high
+# accuracies (above 90%) can be reached by searching over just 10% of the
+# clusters. For uniformly distributed random data, however, over 80% of the
+# clusters need to be searched to attain an accuracy above 90%.
+
+# Here, we propose two sets of parameters that work well on real data (e.g.
+# MNIST, GloVe): + +KNN_KeOps_gpu_IVFFlat_fast = partial(KNN_KeOps_ivf_flat, clusters=10, a=1) +KNN_KeOps_gpu_IVFFlat_slow = partial(KNN_KeOps_ivf_flat, clusters=200, a=40) + +############################################## + + ################################################################################ # SciKit-Learn tree-based and bruteforce methods # ----------------------------------------------------- @@ -631,7 +692,9 @@ def run_KNN_benchmark(name, loops=[1]): routines = [(KNN_JAX_batch_loop, "JAX (small batches, GPU)", {})] else: routines = [ - (KNN_KeOps, "KeOps (GPU)", {}), + (KNN_KeOps, "KeOps-Flat (GPU)", {}), + (KNN_KeOps_gpu_IVFFlat_fast, "KeOps-IVF-Flat (GPU, nprobe=1)", {}), + (KNN_KeOps_gpu_IVFFlat_slow, "KeOps-IVF-Flat (GPU, nprobe=40)", {}), (KNN_faiss_gpu_Flat, "FAISS-Flat (GPU)", {}), (KNN_faiss_gpu_IVFFlat_fast, "FAISS-IVF-Flat (GPU, nprobe=1)", {}), (KNN_faiss_gpu_IVFFlat_slow, "FAISS-IVF-Flat (GPU, nprobe=40)", {}), @@ -660,6 +723,8 @@ def run_KNN_benchmark(name, loops=[1]): legend_location="upper right", linestyles=[ "o-", + "+-.", + "x-.", "s-", "^:", "<:", diff --git a/pykeops/common/ivf.py b/pykeops/common/ivf.py new file mode 100644 index 000000000..3c45056ca --- /dev/null +++ b/pykeops/common/ivf.py @@ -0,0 +1,228 @@ +class GenericIVF: + """Abstract class to compute IVF functions. + + End-users should use 'pykeops.numpy.ivf' or 'pykeops.torch.ivf'. + + """ + + def __init__(self, k, metric, normalise, lazytensor): + + self.__k = k + self.__normalise = normalise + self.__update_metric(metric) + self.__LazyTensor = lazytensor + self.__c = None + + def __update_metric(self, metric): + """ + Update the metric used in the class. + """ + if isinstance(metric, str): + self.__distance = self.tools.distance_function(metric) + self.__metric = metric + elif callable(metric): + self.__distance = metric + self.__metric = "custom" + else: + raise ValueError( + f"The 'metric' argument has type {type(metric)}, but only strings and functions are supported." + ) + + @property + def metric(self): + """Returns the metric used in the search.""" + return self.__metric + + @property + def clusters(self): + """Returns the clusters obtained through K-Means.""" + if self.__c is not None: + return self.__c + else: + raise NotImplementedError("Run .fit() first!") + + def __get_tools(self): + pass + + def __k_argmin(self, x, y, k=1): + """ + Compute the k nearest neighbors between x and y, for various k. + """ + x_i = self.__LazyTensor( + self.tools.to(self.tools.unsqueeze(x, 1), self.__device) + ) + y_j = self.__LazyTensor( + self.tools.to(self.tools.unsqueeze(y, 0), self.__device) + ) + + D_ij = self.__distance(x_i, y_j) + if not self.tools.is_tensor(x): + if self.__backend: + D_ij.backend = self.__backend + + if k == 1: + return self.tools.view(self.tools.long(D_ij.argmin(dim=1)), -1) + else: + return self.tools.long(D_ij.argKmin(K=k, dim=1)) + + def __sort_clusters(self, x, lab, store_x=True): + """ + Takes in a dataset and sorts according to its labels. + + Args: + x ((N, D) array): Input dataset of N points in dimension D. + lab ((N) array): Labels for each point in x. + store_x (bool): Store the sort permutations for use later. + """ + lab, perm = self.tools.sort(self.tools.view(lab, -1)) + if store_x: + self.__x_perm = perm + else: + self.__y_perm = perm + return x[perm], lab + + def __unsort(self, indices): + """ + Given an input indices, undo and prior sorting operations. + First, select the true x indices with __x_perm[indices]. 
+ Then, use index_select to choose the indices in true x, for each true y. + """ + return self.tools.index_select( + self.__x_perm[indices], 0, self.__y_perm.argsort() + ) + + def _fit( + self, + x, + clusters=50, + a=5, + Niter=15, + device=None, + backend=None, + approx=False, + n=50, + ): + """ + Fits the main dataset + """ + + # basic checks that the hyperparameters are as expected + if type(clusters) != int: + raise ValueError("Clusters must be an integer") + if clusters >= len(x): + raise ValueError("Number of clusters must be less than length of dataset") + if type(a) != int: + raise ValueError("Number of clusters to search over must be an integer") + if a > clusters: + raise ValueError( + "Number of clusters to search over must be less than total number of clusters" + ) + if len(x.shape) != 2: + raise ValueError("Input must be a 2D array") + # normalise the input if selected + if self.__normalise: + x = x / self.tools.repeat(self.tools.norm(x, 2, -1), x.shape[1]).reshape( + -1, x.shape[1] + ) + + # if we want to use the approximation in Kmeans, and our metric is angular, switch to full angular metric + if approx and self.__metric == "angular": + self.__update_metric("angular_full") + + x = self.tools.contiguous(x) + self.__device = device + self.__backend = backend + + # perform K-Means + cl, c = self.tools.kmeans( + x, + self.__distance, + clusters, + Niter=Niter, + device=self.__device, + approx=approx, + n=n, + ) + + self.__c = c + # perform one final cluster assignment, since K-Means ends on cluster update step + cl = self.__assign(x) + + # obtain the nearest clusters to each cluster + ncl = self.__k_argmin(c, c, k=a) + self.__x_ranges, _, _ = self.tools.cluster_ranges_centroids(x, cl) + + x, x_labels = self.__sort_clusters(x, cl, store_x=True) + self.__x = x + r = self.tools.repeat(self.tools.arange(clusters, device=self.__device), a) + # create a [clusters, clusters] sized boolean matrix + self.__keep = self.tools.to( + self.tools.zeros([clusters, clusters], dtype=bool), self.__device + ) + # set the indices of the nearest clusters to each cluster to True + self.__keep[r, ncl.flatten()] = True + + return self + + def __assign(self, x, c=None): + """ + Assigns nearest clusters to a dataset. + If no clusters are given, uses the clusters found through K-Means. + + Args: + x ((N, D) array): Input dataset of N points in dimension D. + c ((M, D) array): Cluster locations of M points in dimension D. + """ + if c is None: + c = self.__c + return self.__k_argmin(x, c) + + def _kneighbors(self, y): + """ + Obtain the k nearest neighbors of the query dataset y. + """ + if self.__x is None: + raise ValueError("Input dataset not fitted yet! 
Call .fit() first!") + if self.__device and self.tools.device(y) != self.__device: + raise ValueError("Input dataset and query dataset must be on same device") + if len(y.shape) != 2: + raise ValueError("Query dataset must be a 2D tensor") + if self.__x.shape[-1] != y.shape[-1]: + raise ValueError("Query and dataset must have same dimensions") + if self.__normalise: + y = y / self.tools.repeat(self.tools.norm(y, 2, -1), y.shape[1]).reshape( + -1, y.shape[1] + ) + y = self.tools.contiguous(y) + # assign y to the previously found clusters and get labels + y_labels = self.__assign(y) + + # obtain y_ranges + y_ranges, _, _ = self.tools.cluster_ranges_centroids(y, y_labels) + self.__y_ranges = y_ranges + + # sort y contiguous + y, y_labels = self.__sort_clusters(y, y_labels, store_x=False) + + # perform actual knn computation + x_i = self.__LazyTensor(self.tools.unsqueeze(self.__x, 0)) + y_j = self.__LazyTensor(self.tools.unsqueeze(y, 1)) + D_ij = self.__distance(y_j, x_i) + ranges_ij = self.tools.from_matrix(y_ranges, self.__x_ranges, self.__keep) + D_ij.ranges = ranges_ij + indices = D_ij.argKmin(K=self.__k, axis=1) + return self.__unsort(indices) + + def brute_force(self, x, y, k=5): + """Performs a brute force search with KeOps + + Args: + x ((N, D) array): Input dataset of N points in dimension D. + y ((M, D) array): Query dataset of M points in dimension D. + k (int): Number of nearest neighbors to obtain. + + """ + x_LT = self.__LazyTensor(self.tools.unsqueeze(x, 0)) + y_LT = self.__LazyTensor(self.tools.unsqueeze(y, 1)) + D_ij = self.__distance(y_LT, x_LT) + return D_ij.argKmin(K=k, axis=1) diff --git a/pykeops/common/nystrom_generic.py b/pykeops/common/nystrom_generic.py new file mode 100644 index 000000000..bcd84fd33 --- /dev/null +++ b/pykeops/common/nystrom_generic.py @@ -0,0 +1,222 @@ +import numpy as np +import pykeops +from typing import TypeVar, Union +import warnings + +# Generic placeholder for numpy and torch variables. +generic_array = TypeVar("generic_array") +GenericLazyTensor = TypeVar("GenericLazyTensor") + + +class GenericNystroem: + """ + Super class defining the Nystrom operations. The end user should + use numpy.nystrom or torch.nystrom subclasses. + + """ + + def __init__( + self, + n_components: int = 100, + kernel: Union[str, callable] = "rbf", + sigma: float = None, + inv_eps: float = None, + verbose: bool = False, + random_state: Union[None, int] = None, + ): + + """ + Args: + n_components: int: how many samples to select from data. + kernel: str: type of kernel to use. Current options = {rbf:Gaussian, + exp: exponential}. + sigma: float: exponential constant for the RBF and exponential kernels. + inv_eps: float: additive invertibility constant for matrix decomposition. + verbose: boolean: set True to print details. + random_state: int: to set a random seed for the random sampling of the + samples. To be used when reproducibility is needed. + """ + self.n_components = n_components + self.kernel = kernel + self.sigma = sigma + self.dtype = None + self.verbose = verbose + self.random_state = random_state + self.tools = None + self.lazy_tensor = None + + if inv_eps: + self.inv_eps = inv_eps + else: + self.inv_eps = 1e-8 + + def fit(self, x: generic_array) -> "GenericNystroem": + """ + Args: + x: generic_array: array or tensor of shape (n_samples, n_features) + Returns: + Fitted instance of the class + """ + self.dtype = x.dtype + + # Basic checks + assert self.tools.is_tensor( + x + ), "Input to fit(.) must be an array\ + if using numpy and tensor if using torch." 
+        assert (
+            x.shape[0] >= self.n_components
+        ), "The application needs x.shape[0] >= n_components."
+        if self.kernel == "exp" and self.sigma is not None:
+            assert self.sigma > 0, "sigma must be positive for a decaying exponential."
+
+        # Set default sigma
+        if self.sigma is None:
+            self.sigma = np.sqrt(x.shape[1]) / np.sqrt(2)
+
+        # Update dtype
+        self._update_dtype()
+        # Number of samples
+        n_samples = x.shape[0]
+
+        # Define basis
+        rnd = self._check_random_state(self.random_state)
+        inds = rnd.permutation(n_samples)
+        basis_inds = inds[: self.n_components]
+        basis = x[basis_inds]
+
+        # Build the smaller sampled kernel
+        basis_kernel = self._pairwise_kernels(basis, dense=True)
+
+        # The decomposition is an abstract method that must be defined in each subclass
+        self.normalization = self._decomposition_and_norm(basis_kernel)
+        self.components_ = basis
+        self.component_indices_ = inds
+
+        return self
+
+    def _decomposition_and_norm(self, X: GenericLazyTensor):
+        """
+        To be defined in the subclass.
+
+        Args:
+            X: GenericLazyTensor
+        """
+        raise NotImplementedError(
+            "Subclass must implement the method _decomposition_and_norm."
+        )
+
+    def _get_kernel(self, x: generic_array, y: generic_array) -> generic_array:
+        """
+        To be implemented in the subclass.
+
+        Args:
+            x: generic_array
+            y: generic_array
+
+        Returns:
+            K: generic_array: dense kernel array
+
+        """
+        raise NotImplementedError("Subclass must implement the method _get_kernel.")
+
+    def transform(self, x: generic_array, dense=True) -> generic_array:
+        """Applies the transform to the data, mapping it to the feature space
+        that supports the approximated kernel.
+
+        Args:
+            x: generic_array: data to transform, shape (n_samples, n_features).
+            dense: boolean: (Default value = True)
+
+        Returns:
+            x_new: generic_array: transformed data, shape (n_samples, n_components).
+
+        """
+        if type(x) == np.ndarray and not dense:
+            warnings.warn("For Numpy transform it is best to use dense=True")
+
+        K_nq = self._pairwise_kernels(x, self.components_, dense=dense)
+        x_new = K_nq @ self.normalization
+        return x_new
+
+    def _pairwise_kernels(self, x: generic_array, y: generic_array = None, dense=False):
+        """Helper function to build the kernel.
+
+        Args:
+            x: generic_array: data, shape N x D.
+            y: generic_array: (Default value = None); if given, an M x D array.
+            dense: boolean: (Default value = False). Use True to return a
+                dense generic_array; use False to return a LazyTensor
+                version.
+ + Returns: + LazyTensor: if dense == False + dense array: if dense == True + + """ + + if y is None: + y = x + x = x / (np.sqrt(2) * self.sigma) + y = y / (np.sqrt(2) * self.sigma) + + x_i, x_j = self.tools.contiguous(x[:, None, :]), self.tools.contiguous( + y[None, :, :] + ) # (N, 1, M), (1, N, M) or (1, N, D) + + if self.kernel == "rbf": + if dense: + K_ij = self._get_kernel(x, y) + + else: + x_i, x_j = self.lazy_tensor(x_i), self.lazy_tensor(x_j) + D_ij = ((x_i - x_j) ** 2).sum(dim=2) + K_ij = (-D_ij).exp() + + elif self.kernel == "exp": + if dense: + K_ij = self._get_kernel(x, y, kernel="exp") + + else: + x_i, x_j = self.lazy_tensor(x_i), self.lazy_tensor(x_j) + K_ij = (-(((x_i - x_j) ** 2).sum(-1)).sqrt()).exp() + + # computation with custom kernel + else: + print("Please note that computations on custom kernels are dense-only.") + K_ij = self.kernel(x_i, x_j) + + return K_ij # (N, N) + + def _update_dtype(self) -> None: + """Helper function that sets dtype to that of + the given data in the fitting step. Fixes inv_eps data type to + that of input data. + """ + self.inv_eps = np.array([self.inv_eps]).astype(self.dtype)[0] + + def _check_random_state(self, seed: Union[None, int]) -> None: + """ + Set/get np.random.RandomState instance for permutation. + + Args: + seed: Union[None: int]: + + Returns: + numpy random state + """ + + if seed is None: + return np.random.mtrand._rand + + elif type(seed) == int: + return np.random.RandomState(seed) + + raise ValueError(f"Seed {seed} must be None or an integer.") diff --git a/pykeops/numpy/__init__.py b/pykeops/numpy/__init__.py index 4a85386fc..61a0ebc61 100644 --- a/pykeops/numpy/__init__.py +++ b/pykeops/numpy/__init__.py @@ -5,7 +5,8 @@ ########################################################## # Import pyKeOps routines - +from .nystrom.nystrom import Nystrom +from .knn.ivf import IVF from .generic.generic_red import Genred from .operations import KernelSolve from .convolutions.radial_kernel import RadialKernelConv, RadialKernelGrad1conv @@ -19,6 +20,8 @@ __all__ = sorted( [ + "Nystrom", + "IVF", "Genred", "generic_sum", "generic_logsumexp", diff --git a/pykeops/numpy/knn/__init__.py b/pykeops/numpy/knn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pykeops/numpy/knn/ivf.py b/pykeops/numpy/knn/ivf.py new file mode 100644 index 000000000..df0ff6b7a --- /dev/null +++ b/pykeops/numpy/knn/ivf.py @@ -0,0 +1,65 @@ +from pykeops.common.ivf import GenericIVF +import numpy as np + + +class IVF(GenericIVF): + """IVF-Flat is a KNN approximation algorithm. + + The original FAISS paper can found at https://arxiv.org/abs/1702.08734 . + """ + + def __init__(self, k=5, metric="euclidean", normalise=False): + """Initialise the IVF-Flat class. + + IVF-Flat is a KNN approximation algorithm. + It first clusters the input data. + During query time, it searches only within the closest clusters. + + Args: + k (int): Number of nearest neighbours to obtain. + metric (str,function): Metric to use. + Currently, "euclidean", "manhattan" and "angular" are directly supported. + Custom metrics are not supported in numpy, please use torch version instead. + For more information, refer to the tutorial. + normalise (bool): Whether or not to normalise all input data to norm 1. + This is used mainly for angular metric. + In place of this, "angular_full" metric may be used instead. 
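+        A minimal usage sketch (the shapes and parameter values here are
+        illustrative only):
+
+            import numpy as np
+            from pykeops.numpy import IVF
+
+            x = np.random.randn(10 ** 4, 3).astype("float32")  # dataset to fit
+            y = np.random.randn(10 ** 2, 3).astype("float32")  # query points
+            knn = IVF(k=5, metric="euclidean")
+            knn.fit(x, clusters=50, a=5)
+            indices = knn.kneighbors(y)  # (100, 5) array of neighbour indices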
+ + """ + from pykeops.numpy import LazyTensor + + self.__get_tools() + super().__init__(k=k, metric=metric, normalise=normalise, lazytensor=LazyTensor) + + def __get_tools(self): + from pykeops.numpy.utils import numpytools + + self.tools = numpytools + + def fit(self, x, clusters=50, a=5, Niter=15, backend="CPU", approx=False): + """Fits a dataset to perform the nearest neighbour search over. + + K-Means is performed on the dataset to obtain clusters. + Then the closest clusters to each cluster is stored for use during query time. + + Args: + x ((N, D) array): Input dataset of N points in dimension D. + Where N is the number of points and D is the number of dimensions. + clusters (int): Total number of clusters to create in K-Means. + a (int): Number of clusters to search over, must be less than total number of clusters created. + Niter (int): Number of iterations to run in K-Means algorithm. + + """ + if approx: + raise NotImplementedError( + "Approximation in K-Means not supported for numpy" + ) + return self._fit(x, clusters=clusters, a=a, Niter=Niter, backend=backend) + + def kneighbors(self, y): + """Obtains the nearest neighbors for an input dataset from the fitted dataset. + + Args: + y ((M, D) array): Query dataset of M points in dimension D. + """ + return self._kneighbors(y) diff --git a/pykeops/numpy/nystrom/__init__.py b/pykeops/numpy/nystrom/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pykeops/numpy/nystrom/nystrom.py b/pykeops/numpy/nystrom/nystrom.py new file mode 100644 index 000000000..f4f0acbc6 --- /dev/null +++ b/pykeops/numpy/nystrom/nystrom.py @@ -0,0 +1,103 @@ +import numpy as np + +from scipy.linalg import eigh +from scipy.sparse.linalg import aslinearoperator + +from pykeops.common.nystrom_generic import GenericNystroem + +from typing import List + + +class Nystrom(GenericNystroem): + """ + Nystroem class to work with Numpy arrays. + """ + + def __init__( + self, + n_components=100, + kernel="rbf", + sigma: float = None, + inv_eps: float = None, + verbose=False, + random_state=None, + eigvals: List[int] = None, + ): + + """ + Args: + n_components: int: how many samples to select from data. + kernel: str: type of kernel to use. Current options = {rbf:Gaussian, + exp: exponential}. + sigma: float: exponential constant for the RBF and exponential kernels. + inv_eps: float: additive invertibility constant for matrix decomposition. + verbose: boolean: set True to print details. + random_state: int: to set a random seed for the random sampling of the + samples. To be used when reproducibility is needed. + eigvals: eigenvalues index interval [a,b] for constructed K_q, + where 0 <= a < b < length of K_q + + """ + super().__init__(n_components, kernel, sigma, inv_eps, verbose, random_state) + from pykeops.numpy.utils import numpytools + from pykeops.numpy import LazyTensor + + self.tools = numpytools + self.lazy_tensor = LazyTensor + self.eigvals = eigvals + + if eigvals: + assert eigvals[0] < eigvals[1], "eigvals = [a,b] needs a < b" + assert ( + eigvals[1] < n_components + ), "max eigenvalue index needs to be less\ + than size of K_q = n_components" + + def _decomposition_and_norm(self, X: np.array) -> np.array: + """ + Computes K_q^{-1/2}. 
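+        Here K_q is the (n_components, n_components) kernel evaluated on the
+        sampled basis points; its inverse square root is built from the
+        eigendecomposition computed below.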
+ + Returns: + K_q^{-1/2}: np.array + """ + + X = ( + X + np.eye(X.shape[0], dtype=self.dtype) * self.inv_eps + ) # (Q,Q) Q - num_components + S, U = eigh(X, eigvals=self.eigvals) # (Q,), (Q,Q) + S = np.maximum(S, 1e-12) + + return np.dot(U / np.sqrt(S), U.T) # (Q,Q) + + def _get_kernel(self, x: np.array, y: np.array, kernel=None) -> np.array: + + D_xx = np.sum((x ** 2), axis=-1)[:, None] # (N,1) + D_xy = x @ y.T # (N,D) @ (D,M) = (N,M) + D_yy = np.sum((y ** 2), axis=-1)[None, :] # (1,M) + D_xy = D_xx - 2 * D_xy + D_yy # (N,M) + if kernel == "exp": + D_xy = np.sqrt(D_xy) + return np.exp(-D_xy) # (N,M) + + def K_approx(self, x: np.array) -> "LinearOperator": + """ + Method to return Nystrom approximation to the kernel. + + Args: + x: np.array: data used in fit(.) function. + Returns + K: LinearOperator: Nystrom approximation to kernel + """ + K_nq = self._pairwise_kernels(x, self.components_, dense=False) # (N, Q) + + K_qn = K_nq.T + K_nq.backend = "GPU_2D" + K_qn = aslinearoperator(K_qn) + K_nq = aslinearoperator(K_nq) + + K_q_inv = self.normalization.T @ self.normalization # (Q,Q) + K_q_inv = aslinearoperator(K_q_inv) + + K_approx = K_nq @ K_q_inv @ K_qn # (N,Q), (Q,Q), (Q,N) + + return K_approx # (N, N) diff --git a/pykeops/numpy/utils.py b/pykeops/numpy/utils.py index d46e2f7f7..05efe2c98 100644 --- a/pykeops/numpy/utils.py +++ b/pykeops/numpy/utils.py @@ -2,6 +2,14 @@ from pykeops.numpy import Genred, default_dtype, KernelSolve from pykeops.numpy.cluster import swap_axes as np_swap_axes +from pykeops.numpy.cluster.grid_cluster import grid_cluster as np_grid_cluster +from pykeops.numpy.cluster.matrix import from_matrix as np_from_matrix +from pykeops.numpy.cluster import ( + cluster_ranges_centroids as np_cluster_ranges_centroids, +) +from pykeops.numpy.cluster import cluster_ranges as np_cluster_ranges +from pykeops.numpy.cluster import sort_clusters as np_sort_clusters + import pykeops.config @@ -10,8 +18,14 @@ class numpytools: arraysum = np.sum exp = np.exp log = np.log + sqrt = np.sqrt Genred = Genred KernelSolve = KernelSolve + grid_cluster = np_grid_cluster + from_matrix = np_from_matrix + cluster_ranges_centroids = np_cluster_ranges_centroids + cluster_ranges = np_cluster_ranges + sort_clusters = np_sort_clusters swap_axes = np_swap_axes arraytype = np.ndarray float_types = [float, np.float16, np.float32, np.float64] @@ -104,7 +118,7 @@ def randn(m, n, dtype=default_dtype): return np.random.randn(m, n).astype(dtype) @staticmethod - def zeros(shape, dtype=default_dtype): + def zeros(shape, dtype=default_dtype, device=None): return np.zeros(shape).astype(dtype) @staticmethod @@ -119,6 +133,108 @@ def array(x, dtype=default_dtype, device=None): def device(x): return "cpu" + @staticmethod + def distance_function(metric): + def euclidean(x, y): + return ((x - y) ** 2).sum(-1) + + def manhattan(x, y): + return ((x - y).abs()).sum(-1) + + def angular(x, y): + return -(x | y) + + def angular_full(x, y): + return angular(x, y) / ((angular(x, x) * angular(y, y)).sqrt()) + + if metric == "euclidean": + return euclidean + elif metric == "manhattan": + return manhattan + elif metric == "angular": + return angular + elif metric == "angular_full": + return angular_full + elif metric == "hyperbolic": + raise ValueError( + "Hyperbolic not supported for IVF numpy, please use IVF torch instead" + ) + else: + raise ValueError( + f"Unknown metric: {metric}. 
Supported values are euclidean, manhattan and angular" + ) + + @staticmethod + def sort(x): + perm = np.argsort(x) + return x[perm], perm + + @staticmethod + def unsqueeze(x, n): + return np.expand_dims(x, n) + + @staticmethod + def arange(n, device="cpu"): + return np.arange(n) + + @staticmethod + def repeat(x, n): + return np.repeat(x, n) + + @staticmethod + def to(x, device): + return x + + @staticmethod + def index_select(input, dim, index): + return np.take(input, index, axis=dim) + + @staticmethod + def kmeans(x, distance=None, K=10, Niter=15, device="CPU", approx=False, n=0): + # default metric is euclidean + if distance is None: + distance = numpytools.distance_function("euclidean") + if approx: + raise ValueError("Approx not supported on numpy version") + from pykeops.numpy import LazyTensor + + N, D = x.shape + c = np.copy(x[:K, :]) + x_i = LazyTensor(x[:, None, :]) + for i in range(Niter): + c_j = LazyTensor(c[None, :, :]) + D_ij = distance(x_i, c_j) + D_ij.backend = device + cl = D_ij.argmin(axis=1).astype(int).reshape(N) # cluster assignment + # cluster location update + Ncl = np.bincount(cl).astype(dtype="float32") + for d in range(D): + c[:, d] = np.bincount(cl, weights=x[:, d]) / Ncl + return cl, c + + @staticmethod + def knn_accuracy(indices_test, indices_truth): + """ + Compares the test and ground truth indices (rows = KNN for each point in dataset) + Returns the accuracy or proportion of correct nearest neighbours + + Args: + indices_test ((N, K) array): K indices obtained from approximate NN search + indices_truth ((N, K) array): K indices obtained from exact NN search + """ + N, k = indices_test.shape + + # Calculate number of correct nearest neighbours + accuracy = 0 + for i in range(k): + accuracy += float(np.sum(indices_test == indices_truth)) / N + indices_truth = np.roll( + indices_truth, 1, -1 + ) # Create a rolling window (index positions may not match) + accuracy = float(accuracy / k) # percentage accuracy + + return accuracy + def squared_distances(x, y): x_norm = (x ** 2).sum(1).reshape(-1, 1) diff --git a/pykeops/test/unit_tests_numpy.py b/pykeops/test/unit_tests_numpy.py index 601ad9ae0..249c17737 100644 --- a/pykeops/test/unit_tests_numpy.py +++ b/pykeops/test/unit_tests_numpy.py @@ -437,6 +437,120 @@ def test_LazyTensor_sum(self): self.assertTrue(res_keops.shape == res_numpy.shape) self.assertTrue(np.allclose(res_keops, res_numpy, atol=1e-3)) + ############################################################ + def test_IVF(self): + ########################################################### + from pykeops.numpy import IVF + import numpy as np + + np.random.seed(0) + N, D, clusters, k, a = 10 ** 3, 3, 10, 5, 5 + + # Generate random datapoints x, y + x = np.random.normal(size=(N, D)) + y = np.random.normal(size=(N, D)) + + x2 = np.expand_dims(x, 0) + y2 = np.expand_dims(y, 1) + + # Metrics (hyperbolic metric not implemented for numpy) + metrics = ["euclidean", "manhattan", "angular", "angular_full"] + + for metric in metrics: + # Inputs to IVF algorithm + normalise = False + approx = False + + # Brute force distance calculation + if metric == "euclidean": + distance = ((y2 - x2) ** 2).sum(-1) + elif metric == "manhattan": + distance = np.abs(y2 - x2).sum(-1) + elif metric in {"angular", "angular_full"}: + x3 = x / np.linalg.norm(x, axis=1, keepdims=True) + y3 = y / np.linalg.norm(y, axis=1, keepdims=True) + distance = -y3 @ (x3.T) + if metric == "angular": + # Need to normalize data for angular metric + normalise = True + elif metric == "hyperbolic": + # 
Placeholder in case hyperbolic metric is implemented in future + # Need to ensure first dimension is positive for hyperbolic metric + x += 5 + y += 5 + approx = True + distance = ((y2 - x2) ** 2).sum(-1) / ( + np.expand_dims(x[:, 0], 0) * np.expand_dims(y[:, 0], 1) + ) + + # Ground truth K nearest neighbours + truth = np.argsort(distance, axis=1) + truth = truth[:, :k] + + # IVF K nearest neighbours + test = IVF(metric=metric, k=k, normalise=normalise) + test.fit(x, a=a, approx=approx, clusters=clusters) + ivf_fit = test.kneighbors(y) + + # Calculate accuracy + accuracy = 0 + for i in range(k): + accuracy += float(np.sum(ivf_fit == truth)) / N + truth = np.roll( + truth, 1, -1 + ) # Create a rolling window (index positions may not match) + + accuracy = float(accuracy / k) + + self.assertTrue(accuracy >= 0.8, f"Failed at {a}, {accuracy}") + + ############################################################ + def test_Nystrom_k_approx(self): + ############################################################ + from pykeops.numpy import Nystrom + + length = 100 + num_sampling = 20 + x = np.random.randint(1, 10, (100, 3)).astype(np.float32) + + kernels = ["rbf", "exp"] + + for kernel in kernels: + # calculate the ground truth kernel + N_truth = Nystrom(n_components=length, kernel=kernel, random_state=0).fit(x) + x_truth = N_truth.transform(x) + K = x_truth @ x_truth.T + + # calculate an approximation + N_NK = Nystrom( + n_components=num_sampling, kernel=kernel, random_state=0 + ).fit(x) + x_new = N_NK.transform(x) + K_approx = x_new @ x_new.T + + error = np.linalg.norm(K - K_approx) / K.size + + self.assertTrue(error < 0.01) + + ############################################################ + def test_Nystrom_k_shape(self): + ############################################################ + from pykeops.numpy import Nystrom + + length = 100 + num_sampling = 20 + x = np.random.randint(1, 10, (length, 3)).astype(np.float32) + + kernels = ["rbf", "exp"] + + for kernel in kernels: + N_NK = Nystrom( + n_components=num_sampling, kernel=kernel, random_state=0 + ).fit(x) + + self.assertTrue(N_NK.normalization_.shape == (num_sampling, num_sampling)) + self.assertTrue(N_NK.transform(x).shape == (length, num_sampling)) + if __name__ == "__main__": unittest.main() diff --git a/pykeops/test/unit_tests_pytorch.py b/pykeops/test/unit_tests_pytorch.py index a3bc09af0..341ecd938 100644 --- a/pykeops/test/unit_tests_pytorch.py +++ b/pykeops/test/unit_tests_pytorch.py @@ -673,6 +673,132 @@ def invert_permutation_numpy(permutation): torch.allclose(grad_keops.flatten(), grad_torch.flatten(), rtol=1e-4) ) + ############################################################ + def test_IVF(self): + ############################################################ + from pykeops.torch import IVF + import torch + + torch.manual_seed(0) + N, D, clusters, k, a = 10 ** 3, 3, 10, 5, 5 + + # Generate random datapoints x, y + x = torch.randn(N, D) + y = torch.randn(N, D) + + x2 = x.unsqueeze(0) + y2 = y.unsqueeze(1) + + # Metrics + metrics = ["euclidean", "manhattan", "angular", "angular_full", "hyperbolic"] + + for metric in metrics: + # Inputs to IVF algorithm + normalise = False + approx = False + + # Brute force distance calculation + if metric == "euclidean": + distance = ((y2 - x2) ** 2).sum(-1) + elif metric == "manhattan": + distance = ((y2 - x2).abs()).sum(-1) + elif metric in {"angular", "angular_full"}: + # Calculate normalised dot product (angular distances) + distance = ( + -y + @ (x.T) + / ( + ( + (x @ (x.T)).diag().unsqueeze(0) + * (y 
@ (y.T)).diag().unsqueeze(1) + ).sqrt() + ) + ) + if metric == "angular": + # Need to normalize data for angular metric + normalise = True + elif metric == "hyperbolic": + # Need to ensure first dimension is positive for hyperbolic metric + x += 5 + y += 5 + approx = True + distance = ((y2 - x2) ** 2).sum(-1) / ( + x[:, 0].unsqueeze(0) * y[:, 0].unsqueeze(1) + ) + + # Ground truth K nearest neighbours + truth = torch.argsort(distance, dim=1) + truth = truth[:, :k] + + # IVF K nearest neighbours + test = IVF(metric=metric, k=k, normalise=normalise) + test.fit(x, a=a, approx=approx, clusters=clusters) + ivf_fit = test.kneighbors(y) + + # Calculate accuracy + accuracy = 0 + for i in range(k): + accuracy += torch.sum(ivf_fit == truth).float() / N + truth = torch.roll( + truth, 1, -1 + ) # Create a rolling window (index positions may not match) + + accuracy = float(accuracy / k) + + self.assertTrue(accuracy >= 0.8, f"Failed at {a}, {accuracy}") + + ############################################################ + def test_Nystrom_K_approx(self): + ############################################################ + + from pykeops.torch import Nystrom + import torch + + length = 100 + num_sampling = 40 + x = torch.rand((length, 3), dtype=torch.float32) * 10 + + kernels = ["rbf", "exp"] + + for kernel in kernels: + # calculate the ground truth + N_truth = Nystrom(n_components=length, kernel=kernel, random_state=0).fit(x) + x_truth = N_truth.transform(x) + K = x_truth @ x_truth.T + + # calculate an approximation + N_TK = Nystrom( + n_components=num_sampling, kernel=kernel, random_state=0 + ).fit(x) + + x_new = N_TK.transform(x) + K_approx = x_new @ x_new.T + + error = torch.linalg.norm(K - K_approx) / (K.shape[0] * K.shape[1]) + + self.assertTrue(error < 0.01) + + ############################################################ + def test_Nystrom_K_shape(self): + ############################################################ + + from pykeops.torch import Nystrom + import torch + + length = 100 + num_sampling = 40 + x = torch.rand(length, 3) * 100 + + kernels = ["rbf", "exp"] + + for kernel in kernels: + N_NT = Nystrom( + n_components=num_sampling, kernel=kernel, random_state=0 + ).fit(x) + + self.assertTrue(N_NT.normalization_.shape == (num_sampling, num_sampling)) + self.assertTrue(N_NT.transform(x).shape == (length, num_sampling)) + if __name__ == "__main__": """ diff --git a/pykeops/torch/__init__.py b/pykeops/torch/__init__.py index 7754567cb..6262857e2 100644 --- a/pykeops/torch/__init__.py +++ b/pykeops/torch/__init__.py @@ -27,6 +27,8 @@ ########################################################## # Import pyKeOps routines +from .knn.ivf import IVF +from .nystrom.nystrom import Nystrom from .generic.generic_red import Genred from .generic.generic_ops import ( generic_sum, @@ -39,6 +41,8 @@ __all__ = sorted( [ + "IVF", + "Nystrom", "Genred", "generic_sum", "generic_logsumexp", diff --git a/pykeops/torch/knn/NNDescent.py b/pykeops/torch/knn/NNDescent.py new file mode 100644 index 000000000..19e29ad25 --- /dev/null +++ b/pykeops/torch/knn/NNDescent.py @@ -0,0 +1,643 @@ +import torch +import time +from pykeops.torch import LazyTensor +from pykeops.torch.cluster import cluster_ranges_centroids, from_matrix, sort_clusters +from pykeops.torch.utils import torchtools + + +class NNDescent: + def __init__( + self, + data=None, + k=5, + metric="euclidean", + initialization_method="forest", + num_trees=5, + leaf_multiplier=128, + big_leaf_depth=5, + verbose=False, + backend="torch", + ): + """Initialize the NNDescent 
class.
+
+        Initializes the NNDescent class given all relevant parameters. If data is
+        provided, it fits the NNDescent search graph to the data.
+        NNDescent is an approximation strategy for k-Nearest Neighbor search. It
+        constructs a k-NN graph on the dataset, which is then navigated with a
+        graph-based search algorithm during query time.
+
+        The original paper on the method: https://www.cs.princeton.edu/cass/papers/www11.pdf
+        Our code was inspired by the PyNNDescent documentation: https://pynndescent.readthedocs.io/en/latest/how_pynndescent_works.html
+
+        Args:
+            data ((N,D) Tensor): Dataset of N datapoints of dimensionality D.
+            k (int): The number of nearest neighbors to find for each query point.
+            metric (string): Name of the metric, either "euclidean" or "manhattan".
+            initialization_method (string): The type of initialization to be used for
+                the search graph. Can be "random", "random_big", "forest" or "cluster".
+            num_trees (int): Number of trees used in "random_big" or "forest" initializations.
+            big_leaf_depth (int): The depth at which the big leaves are taken, to be used at
+                the start of the search.
+            verbose (boolean): Determines whether or not to print information while fitting.
+            backend (string): Either "torch" or "keops". Determines whether LazyTensors are used in cluster initialization.
+
+        Argument used only for tree-based initializations:
+            leaf_multiplier (int): Parameter for the RandomProjectionTree class.
+                When initialization_method = "cluster", this value is ignored and a
+                multiplier is derived internally so that the number of clusters is
+                close to the value specified in the fit function.
+        """
+
+        # Setting parameters
+        self.k = k
+        self.metric = metric
+        self.init_method = initialization_method
+        self.num_trees = num_trees
+        self.leaf_multiplier = leaf_multiplier
+        self.big_leaf_depth = big_leaf_depth
+        self.big_leaves = None
+        self.backend = backend
+
+        # Distance function
+        self.distance = torchtools.distance_function(metric)
+
+        # If data is provided, we call the fit function.
+        if data is not None:
+            self.fit(data, verbose=verbose)
+
+    def fit(self, X, iter=20, verbose=False, clusters=32, a=10, queue=5):
+        """Fits the NNDescent search graph to the data set X.
+
+        Args:
+            X ((N,D) Tensor): Dataset of N datapoints of dimensionality D.
+            iter (int): Maximum number of iterations for graph updates.
+            verbose (boolean): Determines whether or not to print information while fitting.
+            queue (int): The number of neighbors to which each node connects in the search graph.
+
+            Used only when initialization_method = "cluster":
+            clusters (int): The minimum number of clusters that the data should be clustered into.
+            a (int): The number of clusters to search over with the cluster method.
+
+        """
+        self.data = X
+        self.device = X.device
+        self.queue = queue
+
+        if queue < self.k and self.init_method != "cluster":
+            self.queue = self.k
+            print(
+                "Warning: queue must be larger than or equal to k! Setting queue = k."
+            )
+        elif queue > a and self.init_method == "cluster":
+            raise ValueError("Value of queue must be smaller than value of a!")
+        elif clusters < 2 ** self.big_leaf_depth:
+            # This is necessary to use the more efficient initial points in the graph search.
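+            # (With the default big_leaf_depth of 5, this requires at least 32 clusters.)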
+ raise ValueError("Minimum number of clusters is 2^big_leaf_depth!") + elif a > clusters: + raise ValueError("Number of clusters must be larger than or equal to a!") + elif X.is_cuda and self.init_method is not "cluster": + raise ValueError("CUDA not supported for non-cluster version of NNDescent.") + + # A 2D tensor representing a directed graph. + # The value a = graph[i,j] represents an edge from point x_i to x_a. + N = X.shape[0] + self.graph = torch.zeros(size=[N, self.queue], dtype=torch.long) + + # Initialize graph + if self.init_method == "random": + self._initialize_graph_randomly() + elif self.init_method == "random_big": + self._initialize_graph_big_random(self.data, self.num_trees) + elif self.init_method == "forest": + self._initialize_graph_forest( + self.data, self.num_trees, self.leaf_multiplier, self.big_leaf_depth + ) + elif self.init_method == "cluster": + # Parameters used only for cluster search + self.a = a + self.num_clusters = clusters + self._initialize_graph_clusters(self.data) + + # A set of tuples (i,j) of indices for which the distance has already been calculated. + self.explored_edges = set() + + if self.init_method != "cluster": + # A 2D tensor representing the distance between point x_i and x_graph[i,j] + self.k_distances = torch.zeros([N, self.queue]) + + # Update the graph + self._calculate_all_distances() + self._update_graph(iter=iter, verbose=verbose) + + def _update_graph(self, iter, verbose=False): + """Updates the current estimate for the kNN-graph with the iterative NN-Descent algorithm + + See https://pynndescent.readthedocs.io/en/latest/how_pynndescent_works.html for detailed explanation. + + Args: + iter (int): Number of iterations to use when updating search graph. + verbose (boolean): Printing information about iterations while searching. + """ + # [STEP 1: Start with random graph.] Iterate + start = time.time() + for it in range(iter): + if verbose: + print( + f"Iteration number {it} with average distance of {torch.mean(self.k_distances).item()}. Took {time.time() - start} seconds." + ) + has_changed = False + + # [STEP 2: For each node:] (TODO: Investigate whether this can be vectorized.) + for i, neighbors in enumerate(self.graph): + # Distances of current neighbors + dist_current_neighbors = self.k_distances[i] + + # [STEP 3: Measure distance from the node to the neighbors of its neighbors] + # Find neighbors of neighbors + potential_neighbors = { + a.item() + for a in self.graph[neighbors].flatten() + if a not in neighbors + and a != i + and (i, int(a)) not in self.explored_edges + } + potential_distances = torch.Tensor( + [ + self.distance(self.data[i], self.data[n]) + for n in potential_neighbors + ] + ) + self.explored_edges.update([(i, int(r)) for r in potential_neighbors]) + + # Concatenate potential neighbors to list of neighbors (indices and distances) + cat_idx = torch.cat( + [neighbors, torch.Tensor(list(potential_neighbors))] + ) + cat_dist = torch.cat([self.k_distances[i], potential_distances]) + + # [STEP 4: If any are closer, then update the graph accordingly, and only keep the k closest] + dist_sorted, idx = torch.sort(cat_dist) + if torch.max(idx[: self.queue]) >= self.queue: + has_changed = True + self.graph[i] = cat_idx[idx[: self.queue]] + self.k_distances[i] = dist_sorted[: self.queue] + + # [STEP 5: If any changes were made, repeat iteration, otherwise stop] + if not has_changed: + if verbose: + print(f"Fitting complete! 
Took {it} iterations.") + break + + def kneighbors(self, X, max_num_steps=100, tree_init=True, verbose=False): + """Returns k nearest neighbors of input X using NNDescent. + + Our code is largely based on this algorithm: + https://pynndescent.readthedocs.io/en/latest/how_pynndescent_works.html#Searching-using-a-nearest-neighbor-graph + + If init_method = 'clusters', we first cluster the data. Each node in the graph then represents a cluster. + We then use the KeOps engine to perform the final nearest neighbours search over the nearest clusters to each query point + + Args: + X ((N,D) Tensor): A query set for which to find k neighbors. + K (int): How many neighbors to search for. Must be <=self.k for non-cluster methods. Default: self.k + max_num_steps (int): The maximum number of steps to take during search. + tree_init (boolean): Determine whether or not to use big leaves from projection trees as the starting point of search. + verbose (boolean): Printing information about iterations while searching. + + Returns: + The indices of the k nearest neighbors in the fitted data. + """ + + # N datapoints of dimension d + N, d = X.shape + k = self.queue + + # Boolean mask to keep track of those points whose search is still ongoing + is_active = torch.ones(N) == 1 + + # If graph was initialized using trees, we can use information from there to initialize in a diversed manner. + if self.big_leaves is not None and tree_init: + candidate_idx = self.big_leaves.unsqueeze(0).repeat( + N, 1 + ) # Shape: (N,2**self.big_leaf_depth) + else: + # Random initialization for starting points of search. + candidate_idx = torch.randint( + high=len(self.data), size=[N, k + 1], dtype=torch.long + ) + + if self.init_method == "cluster": + is_active = is_active.to(self.device) + candidate_idx = candidate_idx.to(self.device) + + # Sort the candidates by distance from X + distances = self.distance(self.data[candidate_idx], X.unsqueeze(1)) + # distances = ((self.data[candidate_idx] - X.unsqueeze(1))**2).sum(-1) + sorted, idx = torch.sort(distances, dim=1) + candidate_idx = torch.gather(candidate_idx, dim=1, index=idx) + # Truncate to k+1 nearest + candidate_idx = candidate_idx[:, : (k + 1)] + + # Track the nodes we have explored already, in N x num_explored tensor + num_explored = k * 2 + explored = torch.full(size=[N, num_explored], fill_value=-1) + + if self.init_method == "cluster": + explored = explored.to(self.device) + + start = time.time() + # The initialization of candidates and explored set is done. Now we can search. + count = 0 + while count < max_num_steps: + if verbose: + print( + f"Step {count} - Search is completed for {1 - torch.mean(1.0 * is_active).item()} - this step took {time.time() - start} s" + ) + start = time.time() + + # [2. Look at nodes connected by an edge to the best untried node in graph] + # diff_bool.shape is (M, k+1, num_explored), where M is the number of active searches + diff_bool = ( + candidate_idx[is_active].unsqueeze(2) - explored[is_active].unsqueeze(1) + == 0 + ) + in_explored = torch.any(diff_bool, dim=2) + # batch_active is true for those who haven't been fully explored in the current batch + batch_active = ~torch.all(in_explored[:, :-1], dim=1) + + # Update is_active mask. 
If none are active, break search + is_active[is_active.clone()] = batch_active + if not is_active.any(): + break + + # first_unexplored has indices of first unexplored element per row + first_unexplored = torch.max(~in_explored[batch_active], dim=1)[ + 1 + ].unsqueeze(1) + # Unexplored nodes to be expanded + unexplored_idx = torch.gather( + candidate_idx[is_active], dim=1, index=first_unexplored + ).squeeze(-1) + explored[is_active, (count % num_explored)] = unexplored_idx + + # [3. Add all these nodes to our potential candidate pool] + # Add neighbors of the first unexplored point to the list of candidates + expanded_idx = torch.cat( + (self.graph[unexplored_idx], candidate_idx[is_active]), dim=1 + ) + + # We remove repeated indices from consideration by adding float('inf') to them. + expanded_idx = torch.sort(expanded_idx)[0] + temp = torch.full((len(expanded_idx), 1), -1) + + if self.init_method == "cluster": + expanded_idx = expanded_idx.to(self.device) + temp = temp.to(self.device) + + shift = torch.cat( + ( + temp, + torch.sort(expanded_idx, dim=1)[0][:, :-1], + ), + dim=1, + ) + unwanted_indices = expanded_idx == shift + + # [4. Sort by closeness]. + distances = self.distance( + self.data[expanded_idx], X[is_active].unsqueeze(1) + ) + # distances = ((self.data[expanded_idx] - X[is_active].unsqueeze(1))**2).sum(-1) + distances[unwanted_indices] += float("inf") + sorted, idx = torch.sort(distances, dim=1) + expanded_idx = torch.gather(expanded_idx, dim=1, index=idx) + + # [5. Truncate to k+1 best] + candidate_idx[is_active] = expanded_idx[:, : (k + 1)] + + # [6. Return to step 2. If we have already tried all candidates in pool, we stop in the if not unexplored] + count += 1 + + # Return the k candidates + if verbose: + print( + f"Graph search finished after {count} steps. Finished for: {(1 - torch.mean(1.0 * is_active).item()) * 100}%." + ) + + if self.init_method == "cluster": + return self.final_brute_force( + candidate_idx[:, : self.k], X, verbose=verbose + ) + else: + return candidate_idx[:, : self.k] + + def _calculate_all_distances(self): + """Updates the distances (self.k_distances) of the edges found in self.graph.""" + # Uses loop for simplicity. 
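+        # (Each pairwise distance is evaluated individually; a vectorised
+        # rewrite could batch these calls, at the cost of clarity.)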
+ for i, row in enumerate(self.graph): + # Indices of current k neighbors in self.graph + neighbor_indices = [(i, int(r)) for r in row] + # The distances of those neighbors are saved in k_distances + self.k_distances[i] = torch.Tensor( + [self.distance(self.data[a], self.data[b]) for a, b in neighbor_indices] + ) + # Add pairs to explored_edges set + self.explored_edges.update(neighbor_indices) + + def _initialize_graph_randomly(self): + """Initializes self.graph with random values such that each point has 'queue' distinct neighbors""" + N, k = self.graph.shape + # Initialize graph randomly, removing self-loops + self.graph = torch.randint(high=N - 1, size=[N, k], dtype=torch.long) + row_indices = torch.arange(N).unsqueeze(1).repeat(1, k) + self.graph[self.graph >= row_indices] += 1 + + def _initialize_graph_big_random(self, data, numtrees): + """Initializes self.graph randomly, but with more neighbours at the start""" + N, k = self.graph.shape + temp_graph = torch.tensor([]) + + # make 'trees', combine into giant graph with each element (row) having k * num_trees neighbours + # this is a small for loop - numtrees and k << datapoints + for j in range(numtrees): + tree_graph = torch.tensor([]) + for i in range(k): + tree_graph = torch.cat( + (tree_graph, torch.randperm(N)), 0 + ) # generate randomly shuffled list of N indices + tree_graph = tree_graph.reshape( + -1, k + ) # creates a N x k tensor with N indices, each appearing k times. This represents 1 'tree' + temp_graph = torch.cat( + (temp_graph, tree_graph), 1 + ) # combine into giant N x (k*num_trees) tensor. This represents the forest + + # find KNN for each row in giant graph + # TODO - implement the below without a for loop + for i, row in enumerate(temp_graph): + temp_row = torch.unique(row).type(torch.LongTensor) # remove duplicates + temp_row = temp_row[temp_row != i] # remove self + + temp_points = data[temp_row, :] # pick out elements from dataset + distances = self.distance(temp_points, data[i]) # Euclidean distances + indices = distances.topk( + k=self.queue, largest=False + ).indices # find indices of KNN + self.graph[i] = temp_row[indices] # assign KNN to graph + + def _initialize_graph_forest(self, data, numtrees, leaf_multiplier, big_leaf_depth): + """Initializes self.graph with a forest of random trees, such that each point has 'queue' distinct neighbors""" + N, k = self.graph.shape + dim = data.shape[1] + + temp_graph = torch.tensor(()) + for j in range(numtrees): + # Create trees, obtain leaves. RandomProjectionTree class is defined below. + t = RandomProjectionTree( + data, k=k * leaf_multiplier, big_leaf_depth=big_leaf_depth + ) + + # Create temporary graph, 1 for each tree + # Leaves are of uneven size; select smallest leaf size as graph size + cols = min([len(leaf) for leaf in t.leaves]) + rows = len(t.leaves) + tree_graph = torch.zeros((N, cols)) + leaves = torch.tensor(()) + idx_update = torch.tensor(()) + + # Update graph using leaves + for leaf in t.leaves: + temp_idx = torch.as_strided( + torch.tensor(leaf).repeat(1, 2), + size=[len(leaf), cols], + stride=[1, 1], + storage_offset=1, + ) + tree_graph[ + leaf, : + ] = temp_idx.float() # update graph. 
a lot of overwriting + # Concatenate all graphs from all trees into 1 giant graph + temp_graph = torch.cat((temp_graph, tree_graph), 1) + + # Add the first tree's big_leaves to the NNDescent's big_leaves + if j == 0 and t.big_leaves: + self.big_leaves = torch.LongTensor(t.big_leaves) + + warning_count = 0 # number of indices for which some neighbours are random + + # find KNN for each row in giant graph + # TODO - implement the below without a for loop + for i, row in enumerate(temp_graph): + temp_row = torch.unique(row).type(torch.LongTensor) # remove duplicates + temp_row = temp_row[temp_row != i] # remove self + + temp_points = data[temp_row, :] # pick out elements from dataset + d = self.distance( + data[i].reshape(1, dim).unsqueeze(1), temp_points.unsqueeze(0) + ) + distances, indices = torch.sort(d, dim=1) + indices = indices.flatten()[:k] + + indices = temp_row[indices] + + # pad with random indices if there are not enough neighbours + warning = False # warning flag + while len(indices) < k: + pad = torch.randint( + high=N - 1, + size=[ + k - len(indices), + ], + dtype=torch.long, + ) + indices = torch.cat((indices, pad)) + indices = torch.unique(indices).type( + torch.LongTensor + ) # remove duplicates + indices = indices[indices != i] # remove self + warning = True + + self.graph[i] = indices # assign KNN to graph + + if warning: + warning_count += 1 + + if warning_count: + print(f"WARNING! {warning_count} INDICES ARE RANDOM!") + + def _initialize_graph_clusters(self, data): + """Initializes self.graph on cluster centroids, such that each cluster has 'a' distinct neighbors""" + N, dim = data.shape + k = self.k + a = self.a + backend = self.backend + leaf_multiplier = ( + N / self.num_clusters / k + ) # to get number of clusters ~ num_clusters + self.clusters = ( + torch.ones( + N, + ) + * -1 + ) + + data = data.to(self.device) + + # Create trees, obtain leaves. RandomProjectionTree class is defined below. 
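+        # leaf_multiplier was derived above so that the tree yields roughly
+        # num_clusters leaves, each of which becomes one cluster.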
+ t = RandomProjectionTree(data, k, self.big_leaf_depth, leaf_multiplier, backend) + + self.leaves = len(t.leaves) + + # Assign each point to a cluster, 1 cluster per tree in forest + for i, leaf in enumerate(t.leaves): + self.clusters[leaf] = i + self.data_orig = self.data.clone() + self.data = t.centroids.clone() + + # Find nearest centroids + x_LT = LazyTensor(self.data.unsqueeze(1).to(self.device)) + y_LT = LazyTensor(self.data.unsqueeze(0).to(self.device)) + d = self.distance(x_LT, y_LT) + indices = d.argKmin(K=a + 1, dim=1).long() + self.centroids_neighbours = indices[:, 1:].long() + + self.num_clusters = self.centroids_neighbours.shape[0] + self.graph = self.centroids_neighbours + + # Assign big_leaves by searching for the correct cluster + self.big_leaves = torch.LongTensor(t.big_leaves) + for i, index in enumerate(self.big_leaves): + self.big_leaves[i] = self.clusters[index] + return + + def final_brute_force(self, nearest_clusters, query_pts, verbose=False): + """ Final brute force search over clusters in cluster method""" + if verbose: + print("Starting brute force search over clusters.") + return self._final_brute_force(nearest_clusters, query_pts) + + def _final_brute_force(self, nearest_clusters, query_pts): + """ Final brute force search over clusters in cluster method""" + if torch.cuda.is_available(): + torch.cuda.synchronize() + + k = self.k + + x = self.data_orig.to(self.device) + x_labels = self.clusters.long() + y = query_pts.to(self.device) + y_labels = nearest_clusters[:, 0] + + x = x.contiguous() + y = y.contiguous() + x_labels = x_labels.to(self.device) + y_labels = y_labels.to(self.device) + + clusters, a = self.graph.shape + r = torch.arange(clusters).repeat(a, 1).T.reshape(-1).long() + keep = torch.zeros([clusters, clusters], dtype=torch.bool).to(self.device) + keep[r, self.graph.flatten()] = True + keep += torch.eye(clusters).bool().to(self.device) + + x_ranges, x_centroids, _ = cluster_ranges_centroids(x, x_labels) + y_ranges, y_centroids, _ = cluster_ranges_centroids(y, y_labels) + + x, x_labels = self.__sort_clusters(x, x_labels, store_x=True) + y, y_labels = self.__sort_clusters(y, y_labels, store_x=False) + + x_LT = LazyTensor(x.unsqueeze(0).to(self.device).contiguous()) + y_LT = LazyTensor(y.unsqueeze(1).to(self.device).contiguous()) + D_ij = self.distance(y_LT, x_LT) + + x_ranges = x_ranges.to(self.device) + y_ranges = y_ranges.to(self.device) + ranges_ij = from_matrix(y_ranges, x_ranges, keep) + D_ij.ranges = ranges_ij + nn = D_ij.argKmin(K=k, axis=1) + return self.__unsort(nn) + + def __sort_clusters(self, x, lab, store_x=True): + lab, perm = torch.sort(lab.view(-1)) + if store_x: + self.__x_perm = perm + else: + self.__y_perm = perm + return x[perm], lab + + def __unsort(self, nn): + return torch.index_select(self.__x_perm[nn], 0, self.__y_perm.argsort()) + + +class RandomProjectionTree: + """ + Random projection tree class that splits the data evenly per split + Each split is performed by calculating the projection distance of each datapoint to a random unit vector + The datapoints are then split by the median of of these projection distances + The indices of the datapoints are stored in tree.leaves, as a nested list + """ + + def __init__( + self, + x, + k=5, + big_leaf_depth=5, + leaf_multiplier=128, + backend="torch", + device=None, + ): + self.min_size = k * leaf_multiplier + self.leaves = [] + self.sizes = [] + if device is None: + self.device = x.device + else: + self.device = device + self.centroids = torch.tensor(()).to(self.device) + 
self.big_leaf_depth = big_leaf_depth + self.big_leaves = [] # leaves at depth = 5 + indices = torch.arange(x.shape[0]) + + self.dim = x.shape[1] + self.data = x.to(self.device) + self.backend = backend # Boolean to choose LT or torch initialization + + self.tree = self.make_tree(indices, depth=0) + self.centroids = self.centroids.reshape(-1, x.shape[1]) + + def make_tree(self, indices, depth): + if depth == 5: # add to big_leaves if depth=5 + self.big_leaves.append(int(indices[0])) + if indices.shape[0] > self.min_size: + v = self.choose_rule().to(self.device) + + if self.backend == "keops": + distances = self.dot_product( + self.data[indices], v + ) # create list of projection distances + else: + distances = torch.tensordot( + self.data[indices], v, dims=1 + ) # create list of projection distances + + median = torch.median(distances) + left_bool = ( + distances <= median + ) # create boolean array where entries are true if distance <= median + self.make_tree(indices[left_bool], depth + 1) + self.make_tree(indices[~left_bool], depth + 1) + elif indices.shape[0] != 0: + self.leaves.append(indices.tolist()) + self.sizes.append(indices.shape[0]) + centroid = self.data[indices].mean(dim=0) # get centroid position + self.centroids = torch.cat((self.centroids, centroid)) + return + + def choose_rule(self): + v = torch.rand(self.dim) # create random vector + v /= torch.norm(v) # normalize to unit vector + return v + + def dot_product(self, x, v): + # Calculate dot product between matrix x and vector v using LazyTensors + v_LT = LazyTensor(v.view(1, 1, -1)) + x_LT = LazyTensor(x.unsqueeze(0)) + return (v_LT | x_LT).sum_reduction(axis=0).flatten() diff --git a/pykeops/torch/knn/__init__.py b/pykeops/torch/knn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pykeops/torch/knn/ivf.py b/pykeops/torch/knn/ivf.py new file mode 100644 index 000000000..d4b77450d --- /dev/null +++ b/pykeops/torch/knn/ivf.py @@ -0,0 +1,70 @@ +from pykeops.common.ivf import GenericIVF +import torch + + +class IVF(GenericIVF): + """IVF-Flat is a KNN approximation algorithm. + + The original FAISS paper can found at https://arxiv.org/abs/1702.08734 + """ + + def __init__(self, k=5, metric="euclidean", normalise=False): + """Initialise the IVF-Flat class. + + IVF-Flat is a KNN approximation algorithm. + It first clusters the input data. + During query time, it searches only within the closest clusters. + + Args: + k (int): Number of nearest neighbours to obtain. + metric (str,function): Metric to use. + Currently, "euclidean", "manhattan", "angular" and "hyperbolic" are directly supported, apart from custom metrics. + Hyperbolic metric requires the use of approx = True, during the fit() function later. + Custom metrics should be in the form of a function with 2 inputs and returns their distance. + For more information, refer to the tutorial. + normalise (bool): Whether or not to normalise all input data to norm 1. + This is used mainly for angular metric. + In place of this, "angular_full" metric may be used instead. + + """ + from pykeops.torch import LazyTensor + + self.__get_tools() + super().__init__(k=k, metric=metric, normalise=normalise, lazytensor=LazyTensor) + + def __get_tools(self): + from pykeops.torch.utils import torchtools + + self.tools = torchtools + + def fit(self, x, clusters=50, a=5, Niter=15, approx=False, n=50): + """Fits a dataset to perform the nearest neighbour search over. + + K-Means is performed on the dataset to obtain clusters. 
+
+        Then the closest clusters to each cluster are stored for use during query time.
+
+        Args:
+            x ((N, D) array): Input dataset of N points in dimension D.
+                Where N is the number of points and D is the number of dimensions.
+            clusters (int): Total number of clusters to create in K-Means.
+            a (int): Number of nearest clusters to search over; must be less than the total number of clusters created.
+            Niter (int): Number of iterations to run in the K-Means algorithm.
+            approx (bool): Whether to use an approximation step in K-Means.
+                For the hyperbolic metric and custom metrics, this should be set to True,
+                since the optimal cluster centroid may not have a simple closed-form expression.
+            n (int): Number of iterations used to optimise the cluster centroid, when approx = True.
+                A value of around 50 is recommended.
+                Lower values are faster while higher values give better accuracy in centroid location.
+
+        """
+        return self._fit(
+            x, clusters=clusters, a=a, Niter=Niter, device=x.device, approx=approx, n=n
+        )
+
+    def kneighbors(self, y):
+        """Obtains the nearest neighbors for an input dataset from the fitted dataset.
+
+        Args:
+            y ((M, D) array): Query dataset of M points in dimension D.
+        """
+        return self._kneighbors(y)
diff --git a/pykeops/torch/nystrom/__init__.py b/pykeops/torch/nystrom/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pykeops/torch/nystrom/nystrom.py b/pykeops/torch/nystrom/nystrom.py
new file mode 100644
index 000000000..d75de024f
--- /dev/null
+++ b/pykeops/torch/nystrom/nystrom.py
@@ -0,0 +1,112 @@
+import torch
+
+from pykeops.common.nystrom_generic import GenericNystroem
+
+
+class Nystrom(GenericNystroem):
+    """
+    Nystroem class to work with PyTorch tensors.
+    """
+
+    def __init__(
+        self,
+        n_components=100,
+        kernel="rbf",
+        sigma: float = None,
+        inv_eps: float = None,
+        verbose=False,
+        random_state=None,
+    ):
+
+        """
+        Args:
+            n_components: int: number of samples to select from the data.
+            kernel: str: type of kernel to use. Current options = {rbf: Gaussian,
+                exp: exponential}.
+            sigma: float: bandwidth constant for the RBF and exponential kernels.
+            inv_eps: float: additive invertibility constant for the matrix decomposition.
+            verbose: boolean: set True to print details.
+            random_state: int: random seed for the sampling of the components.
+                To be used when reproducibility is needed.
+        """
+
+        super().__init__(n_components, kernel, sigma, inv_eps, verbose, random_state)
+        from pykeops.torch.utils import torchtools
+        from pykeops.torch import LazyTensor
+
+        self.tools = torchtools
+        self.verbose = verbose
+        self.lazy_tensor = LazyTensor
+
+    def _decomposition_and_norm(self, basis_kernel) -> torch.Tensor:
+        """
+        Computes the normalization matrix used in the fit() function.
+
+        Args:
+            basis_kernel: torch.Tensor: K_q, the kernel evaluated on the sampled components.
+
+        Returns:
+            K_q^{-1/2}: torch.Tensor
+        """
+
+        U, S, V = torch.linalg.svd(
+            basis_kernel, full_matrices=False
+        )  # (Q,Q), (Q,), (Q,Q)
+        S = torch.clamp(S, min=1e-12)  # keep S away from zero, on the right device
+        return torch.mm(U / torch.sqrt(S), V)  # (Q,Q)
+
+    def _get_kernel(self, x: torch.Tensor, y: torch.Tensor, kernel=None):
+        """
+        Constructs the dense kernel matrix.
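+
+        Args:
+            x ((N, D) tensor): First input dataset.
+            y ((M, D) tensor): Second input dataset.
+            kernel (str, optional): "exp" selects the exponential kernel;
+                any other value (including None) selects the RBF kernel.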
+
+        Returns:
+            K: torch.Tensor: dense kernel matrix
+
+        """
+        D_xx = (x * x).sum(-1).unsqueeze(1)  # (N,1)
+        D_xy = torch.matmul(x, y.permute(1, 0))  # (N,D) @ (D,M) = (N,M)
+        D_yy = (y * y).sum(-1).unsqueeze(0)  # (1,M)
+        D_xy = D_xx - 2 * D_xy + D_yy  # (N,M) squared distances
+        if kernel == "exp":
+            D_xy = torch.sqrt(torch.clamp(D_xy, min=0))  # clamp guards against negative rounding errors
+        return (-D_xy).exp()  # (N,M)
+
+    def _update_dtype(self):
+        "Overloaded to be a no-op in this subclass"
+        pass
+
+    def K_approx(self, X: torch.Tensor) -> "K_approx_operator":
+        """Returns the Nystrom approximation to the kernel.
+
+        Args:
+            X: torch.Tensor: data used in the fit() function.
+        Returns:
+            K_approx: K_approx_operator: Nystrom approximation to the kernel,
+                which can be applied downstream as K_approx @ v for some
+                tensor v
+        """
+
+        K_nq = self._pairwise_kernels(X, self.components_, dense=False)  # (N, Q)
+        K_approx = K_approx_operator(K_nq, self.normalization)
+        return K_approx  # acts on tensors v of shape (N, B)
+
+
+class K_approx_operator:
+    """Helper class wrapping K_approx as an object
+    compatible with the @ (matmul) operator
+    """
+
+    def __init__(self, K_nq, normalization):
+
+        self.K_nq = K_nq  # dim: number of samples x number of components
+        self.normalization = normalization
+
+    def __matmul__(self, v: torch.Tensor) -> torch.Tensor:
+
+        K_qn = self.K_nq.T
+        self.K_nq.backend = "GPU_2D"
+
+        x = K_qn @ v  # (Q,N) @ (N,B)
+        x = self.normalization @ self.normalization.T @ x  # (Q,Q) @ (Q,Q) @ (Q,B)
+        x = self.K_nq @ x  # (N,Q) @ (Q,B)
+        return x  # (N,B)
diff --git a/pykeops/torch/utils.py b/pykeops/torch/utils.py
index 89e71d888..f501bd429 100644
--- a/pykeops/torch/utils.py
+++ b/pykeops/torch/utils.py
@@ -2,7 +2,13 @@
 from pykeops.torch import Genred, KernelSolve, default_dtype
 from pykeops.torch.cluster import swap_axes as torch_swap_axes
-
+from pykeops.torch.cluster import grid_cluster as torch_grid_cluster
+from pykeops.torch.cluster import from_matrix as torch_from_matrix
+from pykeops.torch.cluster import (
+    cluster_ranges_centroids as torch_cluster_ranges_centroids,
+)
+from pykeops.torch.cluster import cluster_ranges as torch_cluster_ranges
+from pykeops.torch.cluster import sort_clusters as torch_sort_clusters
 
 # from pykeops.torch.generic.generic_red import GenredLowlevel
 
@@ -16,11 +22,20 @@ class torchtools:
     exp = torch.exp
     log = torch.log
     norm = torch.norm
+    sqrt = torch.sqrt
     swap_axes = torch_swap_axes
 
     Genred = Genred
     KernelSolve = KernelSolve
 
+    grid_cluster = torch_grid_cluster
+    from_matrix = torch_from_matrix
+    cluster_ranges_centroids = torch_cluster_ranges_centroids
+    cluster_ranges = torch_cluster_ranges
+    sort_clusters = torch_sort_clusters
 
     arraytype = torch.Tensor
     float_types = [float]
@@ -157,6 +172,148 @@ def device(x):
         else:
             return None
 
+    @staticmethod
+    def distance_function(metric):
+        def euclidean(x, y):
+            return ((x - y) ** 2).sum(-1)
+
+        def manhattan(x, y):
+            return ((x - y).abs()).sum(-1)
+
+        def angular(x, y):
+            return -(x | y)
+
+        def angular_full(x, y):
+            return angular(x, y) / ((angular(x, x) * angular(y, y)).sqrt())
+
+        def hyperbolic(x, y):
+            return ((x - y) ** 2).sum(-1) / (x[0] * y[0])
+
+        if metric == "euclidean":
+            return euclidean
+        elif metric == "manhattan":
+            return manhattan
+        elif metric == "angular":
+            return angular
+        elif metric == "angular_full":
+            return angular_full
+        elif metric == "hyperbolic":
+            return hyperbolic
+        else:
+            raise ValueError(f"Unknown metric: {metric}")
+
+    @staticmethod
+    def sort(x):
+        return torch.sort(x)
+
+    @staticmethod
+    def unsqueeze(x, n):
+        return torch.unsqueeze(x, n)
+
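+    # The static methods below are thin, backend-agnostic wrappers around torch
+    # functions: GenericIVF only calls them through self.tools, so the same
+    # search code can also run with the numpy tools.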
+    @staticmethod
+    def arange(n, device="cpu"):
+        return torch.arange(n, device=device)
+
+    @staticmethod
+    def repeat(x, n):
+        return torch.repeat_interleave(x, n)
+
+    @staticmethod
+    def to(x, device):
+        if isinstance(x, torch.Tensor):
+            return x.to(device)
+        return x
+
+    @staticmethod
+    def index_select(input, dim, index):
+        return torch.index_select(input, dim, index)
+
+    @staticmethod
+    def kmeans(x, distance=None, K=10, Niter=15, device="cuda", approx=False, n=10):
+
+        from pykeops.torch import LazyTensor
+
+        # default metric is euclidean
+        if distance is None:
+            distance = torchtools.distance_function("euclidean")
+
+        def calc_centroid(x, c, cl, n=10):
+            "Helper function to optimise centroid locations by gradient descent"
+            c = torch.clone(c.detach()).to(device)
+            c.requires_grad = True
+            x1 = LazyTensor(x.unsqueeze(0))
+            op = torch.optim.Adam([c], lr=1 / n)
+            # scale each point's contribution by the size of its cluster
+            scaling = 1 / torch.gather(torch.bincount(cl), 0, cl).view(-1, 1)
+            scaling.requires_grad = False
+            with torch.autograd.set_detect_anomaly(True):
+                # perform n gradient descent steps
+                for _ in range(n):
+                    c.requires_grad = True
+                    op.zero_grad()
+                    c1 = LazyTensor(torch.index_select(c, 0, cl).unsqueeze(0))
+                    d = distance(x1, c1)
+                    # distance of each datapoint to its assigned centroid,
+                    # averaged over the number of points in that cluster
+                    loss = (d.sum(0) * scaling).sum()
+                    loss.backward(retain_graph=False)
+                    op.step()
+            return c.detach()  # return the optimised cluster centroids
+
+        N, D = x.shape
+        c = x[:K, :].clone()  # initialise centroids with the first K points
+        x_i = LazyTensor(x.view(N, 1, D).to(device))
+
+        for i in range(Niter):
+            c_j = LazyTensor(c.view(1, K, D).to(device))
+            D_ij = distance(x_i, c_j)
+            cl = D_ij.argmin(dim=1).long().view(-1)  # cluster assignment
+
+            # updating c: either with the approximation or exactly
+            if approx:
+                # approximate with gradient descent optimisation
+                c = calc_centroid(x, c, cl, n)
+
+            else:
+                # exact update: average of the points in each cluster
+                c.zero_()
+                c.scatter_add_(0, cl[:, None].repeat(1, D), x)
+                Ncl = torch.bincount(cl, minlength=K).type_as(c).view(K, 1)
+                c /= Ncl
+            # check for NaNs, which may occur when a metric is used incorrectly,
+            # e.g. the angular metric with unnormalised data
+            if torch.any(torch.isnan(c)):
+                raise ValueError(
+                    "NaN detected in centroids during KMeans, please check that the metric is correct"
+                )
+        return cl, c
+
+    @staticmethod
+    def is_tensor(x):
+        return isinstance(x, torch.Tensor)
+
+    @staticmethod
+    def knn_accuracy(indices_test, indices_truth):
+        """
+        Compares the test and ground truth indices (rows = KNN for each point in the dataset).
+        Returns the recall, i.e. the proportion of correct nearest neighbours.
+
+        Args:
+            indices_test ((N, K) array): K indices obtained from approximate NN search
+            indices_truth ((N, K) array): K indices obtained from exact NN search
+        """
+        N, k = indices_test.shape
+
+        # Count the number of correct nearest neighbours
+        accuracy = 0
+        for i in range(k):
+            accuracy += torch.sum(indices_test == indices_truth).float() / N
+            indices_truth = torch.roll(
+                indices_truth, 1, -1
+            )  # roll the truth indices, since index positions may not match
+        accuracy = float(accuracy / k)  # proportion of correct neighbours
+
+        return accuracy
+
 
 def squared_distances(x, y):
     x_norm = (x ** 2).sum(1).reshape(-1, 1)
diff --git a/pykeops/tutorials/knn/plot_ivf_numpy.py b/pykeops/tutorials/knn/plot_ivf_numpy.py
new file mode 100644
index 000000000..8bbe728d5
--- /dev/null
+++ b/pykeops/tutorials/knn/plot_ivf_numpy.py
@@ -0,0 +1,121 @@
+"""
+=========================================================
+IVF-Flat approximate nearest neighbors search - Numpy API
+=========================================================
+
+The :class:`pykeops.numpy.IVF` class supported by KeOps allows us
+to perform **approximate nearest neighbor search** with four lines of code.
+It can thus be used to perform **large-scale** nearest neighbor searches **much faster**.
+The code is based on the IVF-Flat algorithm and uses KeOps' block-sparse reductions to speed up the search by reducing the search space.
+
+Euclidean, Manhattan and Angular metrics are supported.
+
+.. note::
+    Hyperbolic and custom metrics are not supported in the Numpy API; please use the PyTorch API instead.
+
+"""
+
+########################################################################
+# Setup
+# -----------------
+# Standard imports:
+
+import time
+import numpy as np
+from pykeops.numpy import IVF
+from pykeops.numpy.utils import numpytools
+
+########################################################################
+# IVF nearest neighbour search with Euclidean metric
+# --------------------------------------------------
+# First experiment with N=$10^5$ points in dimension D=3 and 5 nearest neighbours
+
+N, D, k = 10 ** 5, 3, 5
+
+########################################################################
+# Define our dataset:
+
+np.random.seed(1)
+x = 0.7 * np.random.randn(N, D) + 0.3
+y = 0.7 * np.random.randn(N, D) + 0.3
+
+########################################################################
+# Create the IVF class and fit the dataset:
+
+nn = IVF(k=k)
+# set the number of clusters in K-Means to 50
+# set the number of nearest clusters searched over in the final query search to 5
+nn.fit(x, clusters=50, a=5)
+
+########################################################################
+# Query dataset search
+
+approx_nn = nn.kneighbors(y)
+
+########################################################################
+# Now computing the true nearest neighbors with a brute force search
+
+true_nn = nn.brute_force(x, y, k=k)
+
+########################################################################
+# Check the performance of our algorithm
+
+print("IVF Recall:", numpytools.knn_accuracy(approx_nn, true_nn))
+
+########################################################################
+# Timing the algorithms to observe their performance
+
+start = time.time()
+iters = 10
+
+# timing KeOps brute force
+for _ in range(iters):
+    true_nn = nn.brute_force(x, y, k=k)
+bf_time = time.time() - start
+print(
+    "KeOps brute force timing for", N, "points with", D, "dimensions:", bf_time / iters
+)
+
+# timing IVF
+nn = IVF(k=k)
+nn.fit(x)
+start = time.time()
+for _ in range(iters):
+    approx_nn = nn.kneighbors(y)
+ivf_time = time.time() - start
+print("KeOps IVF-Flat timing for", N, "points with", D, "dimensions:", ivf_time / iters)
+
+########################################################################
+# IVF nearest neighbors search with the angular metric
+# Second experiment with N=$10^5$ points in dimension D=3, with 5 nearest neighbors
+
+np.random.seed(1)
+x = 0.7 * np.random.randn(N, D) + 0.3
+y = 0.7 * np.random.randn(N, D) + 0.3
+
+# normalising the inputs to norm 1
+x_norm = x / np.linalg.norm(x, axis=1, keepdims=True)
+y_norm = y / np.linalg.norm(y, axis=1, keepdims=True)
+
+nn = IVF(metric="angular")
+true_nn = nn.brute_force(x_norm, y_norm)
+
+nn = IVF(metric="angular")
+nn.fit(x_norm)
+approx_nn = nn.kneighbors(y_norm)
+print("IVF Recall:", numpytools.knn_accuracy(approx_nn, true_nn))
+
+########################################################################
+# The IVF class also has an option to automatically normalise all inputs
+
+nn = IVF(metric="angular", normalise=True)
+nn.fit(x)
+approx_nn = nn.kneighbors(y)
+print("IVF Recall:", numpytools.knn_accuracy(approx_nn, true_nn))
+
+########################################################################
+# There is also the "angular_full" metric, which computes the full cosine distance,
+# whereas "angular" simply uses the (negative) dot product and assumes normalised inputs.
+
+nn = IVF(metric="angular_full")
+nn.fit(x)
+approx_nn = nn.kneighbors(y)
+print("IVF Recall:", numpytools.knn_accuracy(approx_nn, true_nn))
diff --git a/pykeops/tutorials/knn/plot_ivf_torch.py b/pykeops/tutorials/knn/plot_ivf_torch.py
new file mode 100644
index 000000000..d6e6f3c0a
--- /dev/null
+++ b/pykeops/tutorials/knn/plot_ivf_torch.py
@@ -0,0 +1,164 @@
+"""
+===========================================================
+IVF-Flat approximate nearest neighbors search - PyTorch API
+===========================================================
+
+The :class:`pykeops.torch.IVF` class supported by KeOps allows us
+to perform **approximate nearest neighbor search** with four lines of code.
+It can thus be used to perform **large-scale** nearest neighbor searches **much faster**.
+The code is based on the IVF-Flat algorithm and uses KeOps' block-sparse reductions to speed up the search by reducing the search space.
+
+Euclidean, Manhattan, Angular and Hyperbolic metrics are supported, along with custom metrics.
+
+.. note::
+    Hyperbolic and custom metrics require the use of an approximation during the K-Means step.
+    This is used to obtain the centroid locations, since a closed-form expression might not be readily available.
+"""
+
+###############################################################
+# Setup
+# -----------------
+# Standard imports:
+
+import time
+import torch
+from pykeops.torch import IVF
+from pykeops.torch.utils import torchtools
+
+use_cuda = torch.cuda.is_available()
+device = torch.device("cuda") if use_cuda else torch.device("cpu")
+dtype = torch.float32 if use_cuda else torch.float64
+
+###############################################################
+# IVF nearest neighbour search with Euclidean metric
+# --------------------------------------------------
+# First experiment with N=$10^6$ points in dimension D=3 and 5 nearest neighbours
+
+
+N, D, k = 10 ** 6, 3, 5
+
+###############################################################
+# Define our dataset:
+
+torch.manual_seed(1)
+x = 0.7 * torch.randn(N, D, dtype=dtype, device=device) + 0.3
+y = 0.7 * torch.randn(N, D, dtype=dtype, device=device) + 0.3
+
+###############################################################
+# Create the IVF class and fit the dataset:
+
+nn = IVF(k=k)
+# set the number of clusters in K-Means to 50
+# set the number of nearest clusters searched over in the final query search to 5
+nn.fit(x, clusters=50, a=5)
+
+###############################################################
+# Query dataset search
+
+approx_nn = nn.kneighbors(y)
+
+###############################################################
+# Now computing the true nearest neighbors with a brute force search
+
+true_nn = nn.brute_force(x, y, k=k)
+
+###############################################################
+# Check the performance of our algorithm
+
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
+
+###############################################################
+# Timing the algorithms to observe their performance
+
+start = time.time()
+iters = 10
+
+# timing KeOps brute force
+for _ in range(iters):
+    true_nn = nn.brute_force(x, y, k=k)
+bf_time = time.time() - start
+print(
+    "KeOps brute force timing for", N, "points with", D, "dimensions:", bf_time / iters
+)
+
+# timing IVF
+nn = IVF(k=k)
+nn.fit(x)
+start = time.time()
+for _ in range(iters):
+    approx_nn = nn.kneighbors(y)
+ivf_time = time.time() - start
+print("KeOps IVF-Flat timing for", N, "points with", D, "dimensions:", ivf_time / iters)
+
+###############################################################
+# IVF nearest neighbors search with the angular metric
+# Second experiment with N=$10^6$ points in dimension D=3, with 5 nearest neighbors
+
+torch.manual_seed(1)
+x = 0.7 * torch.randn(N, D, dtype=dtype, device=device) + 0.3
+y = 0.7 * torch.randn(N, D, dtype=dtype, device=device) + 0.3
+
+# normalising the inputs to norm 1
+x_norm = x / torch.linalg.norm(x, dim=1, keepdim=True)
+y_norm = y / torch.linalg.norm(y, dim=1, keepdim=True)
+
+nn = IVF(metric="angular")
+true_nn = nn.brute_force(x_norm, y_norm)
+
+nn = IVF(metric="angular")
+nn.fit(x_norm)
+approx_nn = nn.kneighbors(y_norm)
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
+
+###############################################################
+# The IVF class also has an option to automatically normalise all inputs
+
+nn = IVF(metric="angular", normalise=True)
+nn.fit(x)
+approx_nn = nn.kneighbors(y)
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
+
+###############################################################
+# There is also the "angular_full" metric, which computes the full cosine distance,
+# whereas "angular" simply uses the (negative) dot product and assumes normalised inputs.
+
+nn = IVF(metric="angular_full")
+nn.fit(x)
+approx_nn = nn.kneighbors(y)
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
+
+###############################################################
+# IVF nearest neighbors search with approximations for the K-Means centroids
+# We run two experiments with N=$10^6$ points in dimension D=3, with 5 nearest neighbors. The first uses the hyperbolic metric while the second uses a custom metric.
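+# When approx=True, the closed-form centroid update of K-Means is replaced by a
+# few gradient-descent steps that minimise the chosen metric directly (see the
+# ``n`` argument of fit).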
+
+# hyperbolic data generation
+torch.manual_seed(1)
+x = 0.5 + torch.rand(N, D, dtype=dtype, device=device)
+y = 0.5 + torch.rand(N, D, dtype=dtype, device=device)
+
+nn = IVF(metric="hyperbolic")
+# set approx to True
+# n is the number of gradient descent steps used in the approximation (default 50)
+nn.fit(x, approx=True, n=50)
+approx_nn = nn.kneighbors(y)
+true_nn = nn.brute_force(x, y)
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
+
+# define a custom metric
+def minkowski(x, y, p=3):
+    """Computes the Minkowski distance of order p (without the final root,
+    which does not change the neighbour ranking).
+    Note the shapes that a custom metric must accept:
+
+    Args:
+        x (tensor): Input dataset of shape (1, N, D)
+        y (tensor): Query dataset of shape (M, 1, D)
+
+    """
+    return ((x - y).abs() ** p).sum(-1)
+
+
+# testing the custom metric
+nn = IVF(metric=minkowski)
+nn.fit(x, approx=True)
+approx_nn = nn.kneighbors(y)
+true_nn = nn.brute_force(x, y)
+print("IVF Recall:", torchtools.knn_accuracy(approx_nn, true_nn))
diff --git a/pykeops/tutorials/knn/plot_nnd_torch.py b/pykeops/tutorials/knn/plot_nnd_torch.py
new file mode 100644
index 000000000..1e39c2ba0
--- /dev/null
+++ b/pykeops/tutorials/knn/plot_nnd_torch.py
@@ -0,0 +1,154 @@
+"""
+====================================================================================
+Nearest Neighbors Descent (NND) approximate nearest neighbors search - PyTorch API
+====================================================================================
+
+The :class:`pykeops.torch.NND` class supported by KeOps allows us
+to perform **approximate nearest neighbor search** with four lines of code.
+
+Euclidean and Manhattan metrics are supported.
+
+.. note::
+    NNDescent is not fully optimised and we recommend the use of IVF-Flat instead.
+    Nevertheless, we provide NNDescent as a means of benchmarking cutting-edge nearest neighbor search algorithms.
+
+"""
+
+########################################################################
+# Setup
+# -----------------
+# Standard imports:
+
+import time
+import torch
+from pykeops.torch import NND, LazyTensor
+from pykeops.torch.utils import torchtools
+
+use_cuda = torch.cuda.is_available()
+device = torch.device("cuda") if use_cuda else torch.device("cpu")
+dtype = torch.float32 if use_cuda else torch.float64
+
+########################################################################
+# NNDescent search with Euclidean metric
+# First experiment with N=$10^4$ points in dimension D=3 and 5 nearest neighbours, and default hyperparameters.
+
+N, D, k = 10 ** 4, 3, 5
+
+########################################################################
+# Define our dataset:
+
+torch.manual_seed(1)
+x = 0.7 * torch.randn(N, D, dtype=dtype) + 0.3
+y = 0.7 * torch.randn(N, D, dtype=dtype) + 0.3
+
+########################################################################
+# Create the NND class and fit the dataset:
+
+nn = NND(k=k)
+nn.fit(x, queue=8)
+
+########################################################################
+# Query dataset search
+
+approx_nn = nn.kneighbors(y)
+
+########################################################################
+# Define the function to compute the true nearest neighbors with a brute force search
+
+
+def brute_force(x, y, k, metric):
+    x_i = LazyTensor(x.unsqueeze(0).to(device))
+    y_j = LazyTensor(y.unsqueeze(1).to(device))
+    if metric == "euclidean":
+        D_ij = ((x_i - y_j) ** 2).sum(-1)
+    elif metric == "manhattan":
+        D_ij = ((x_i - y_j).abs()).sum(-1)
+    indices = D_ij.argKmin(K=k, dim=1).long()
+    return indices
+
+
+########################################################################
+# Compute the true nearest neighbors with a brute force search using the Euclidean distance
+
+indices = brute_force(x=x, y=y, k=k, metric="euclidean")
+
+########################################################################
+# Check the performance of our algorithm
+
+print("NND Recall:", torchtools.knn_accuracy(approx_nn.to(device), indices))
+
+########################################################################
+# Define a function to time the algorithms and observe their performance
+
+
+def timing(x, y, k, N, D, metric):
+    start = time.time()
+    iters = 10
+
+    # timing KeOps brute force
+    for _ in range(iters):
+        indices = brute_force(x=x, y=y, k=k, metric=metric)
+    bf_time = time.time() - start
+    print(
+        "KeOps brute force timing for",
+        N,
+        "points with",
+        D,
+        "dimensions:",
+        bf_time / iters,
+    )
+
+    # timing NNDescent
+    start = time.time()
+    for _ in range(iters):
+        approx_nn = nn.kneighbors(y)
+    nnd_time = time.time() - start
+    print("KeOps NND timing for", N, "points with", D, "dimensions:", nnd_time / iters)
+
+
+########################################################################
+# Timing the algorithms to observe their performance
+
+timing(x=x, y=y, k=k, N=N, D=D, metric="euclidean")
+
+########################################################################
+# NNDescent search using clusters and the Manhattan distance
+# Second experiment with N=$10^6$ points in dimension D=3, with 5 nearest neighbors and the Manhattan distance.
+
+N, D, k = 10 ** 6, 3, 5
+
+########################################################################
+# Define our dataset:
+
+torch.manual_seed(1)
+x = 0.7 * torch.randn(N, D, dtype=dtype) + 0.3
+x = x.to(device)
+y = 0.7 * torch.randn(N, D, dtype=dtype) + 0.3
+y = y.to(device)
+
+########################################################################
+# Create the NND class and fit the dataset:
+
+nn = NND(k=k, metric="manhattan", initialization_method="cluster")
+nn.fit(x, a=10, queue=5, clusters=64)
+
+########################################################################
+# Query dataset search
+
+approx_nn = nn.kneighbors(y)
+
+########################################################################
+# Now computing the true nearest neighbors with a brute force search using the Manhattan distance
+
+indices = brute_force(x=x, y=y, k=k, metric="manhattan")
+
+########################################################################
+# Check the performance of our algorithm
+
+print("NND Recall:", torchtools.knn_accuracy(approx_nn.to(device), indices))
+
+########################################################################
+# Timing the algorithms to observe their performance
+
+timing(x=x, y=y, k=k, N=N, D=D, metric="manhattan")
diff --git a/setup.py b/setup.py
index 55597515f..936610406 100644
--- a/setup.py
+++ b/setup.py
@@ -103,12 +103,16 @@ def import_files(dirname, ext=["h", "hpp"]):
         "pykeops.numpy.generic",
         "pykeops.numpy.lazytensor",
         "pykeops.numpy.shape_distance",
+        "pykeops.numpy.knn",
+        "pykeops.numpy.nystrom",
         "pykeops.test",
         "pykeops.torch",
         "pykeops.torch.cluster",
         "pykeops.torch.generic",
         "pykeops.torch.lazytensor",
         "pykeops.torch.kernel_product",
+        "pykeops.torch.knn",
+        "pykeops.torch.nystrom",
     ],
     package_data={
         "pykeops": [