diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index ad90c47..45c55a1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -19,6 +19,8 @@ jobs:
 
     steps:
       - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
 
       - name: Set up Python 3.11
         uses: actions/setup-python@v5
@@ -28,8 +30,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          pip install --editable .[check,test]
-          pip install "coveralls>=3.0.0"
+          pip install -v --editable '.[check]'
 
       - name: black check
         run: |
@@ -43,64 +44,137 @@ jobs:
         run: |
           python -m isort --check --diff --color .
 
-      - name: pylint check
+      - name: ruff check
         run: |
-          python -m pylint src/anaflow/
+          python -m ruff check src/
 
-      - name: coveralls check
+      - name: cython-lint check
         run: |
-          python -m pytest --cov anaflow --cov-report term-missing -v tests/
-          python -m coveralls --service=github
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          cython-lint src/anaflow/
 
-  build_sdist:
-    name: sdist on ${{ matrix.os }} with py ${{ matrix.python-version }}
+  build_wheels:
+    name: wheels for ${{ matrix.os }}
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        os: [ubuntu-latest, windows-latest, macos-13]
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
+        os: [ubuntu-latest, ubuntu-24.04-arm, windows-latest, macos-15-intel, macos-latest] # windows-11-arm (no scipy)
 
     steps:
       - uses: actions/checkout@v4
         with:
-          fetch-depth: '0'
+          fetch-depth: "0"
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v3.2.1
+        with:
+          output-dir: dist-wheel-${{ matrix.os }}
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: dist-wheel-${{ matrix.os }}
+          path: ./dist-wheel-${{ matrix.os }}/*.whl
 
-      - name: Set up Python ${{ matrix.python-version }}
+  build_sdist:
+    name: sdist on ${{ matrix.os }} with py ${{ matrix.ver.py }} numpy${{ matrix.ver.np }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          [ubuntu-latest, ubuntu-24.04-arm, windows-latest, macos-14]
+        ver:
+          - { py: "3.9", np: "==1.20.0", sp: "==1.5.4" }
+          - { py: "3.10", np: "==1.21.6", sp: "==1.7.2" }
+          - { py: "3.11", np: "==1.23.2", sp: "==1.9.2" }
+          - { py: "3.12", np: "==1.26.2", sp: "==1.11.2" }
+          - { py: "3.13", np: "==2.1.0", sp: "==1.14.1" }
+          - { py: "3.14", np: "==2.3.2", sp: "==1.16.1" }
+          - { py: "3.14", np: ">=2.3.2", sp: ">=1.16.1" }
+        exclude:
+          - os: ubuntu-24.04-arm
+            ver: { py: "3.8", np: "==1.20.0", sp: "==1.5.4" }
+          - os: macos-14
+            ver: { py: "3.8", np: "==1.20.0", sp: "==1.5.4" }
+          - os: macos-14
+            ver: { py: "3.9", np: "==1.20.0", sp: "==1.5.4" }
+          - os: macos-14
+            ver: { py: "3.10", np: "==1.21.6", sp: "==1.7.2" }
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      - name: Set up Python ${{ matrix.ver.py }}
         uses: actions/setup-python@v5
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.ver.py }}
 
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           pip install build
-          pip install --editable .[test]
+
+      - name: Install anaflow
+        run: |
+          pip install -v --editable .[test]
 
       - name: Run tests
         run: |
-          python -m pytest --cov anaflow --cov-report term-missing -v tests/
+          pip install "numpy${{ matrix.ver.np }}" "scipy${{ matrix.ver.sp }}"
+          python -m pytest -v tests/
 
       - name: Build sdist
         run: |
-          python -m build
+          # PEP 517 package builder from pypa
+          python -m build --sdist --outdir dist-sdist .
 
       - uses: actions/upload-artifact@v4
-        if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11'
+        if: matrix.os == 'ubuntu-latest' && matrix.ver.py == '3.11'
         with:
-          name: dist
-          path: dist/*
+          name: dist-sdist
+          path: dist-sdist/*.tar.gz
+
+  coverage:
+    name: coverage
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: '0'
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.11
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install "coveralls>=3.0.0"
+
+      - name: Install anaflow
+        run: |
+          pip install -v --editable .[test]
+
+      - name: Run tests
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          pip install "numpy${{ matrix.ver.np }}"
+          python -m pytest --cov anaflow --cov-report term-missing -v tests/
+          python -m coveralls --service=github
 
   upload_to_pypi:
-    needs: [build_sdist]
+    needs: [build_wheels, build_sdist]
     runs-on: ubuntu-latest
 
     steps:
       - uses: actions/download-artifact@v4
         with:
-          name: dist
+          pattern: dist-*
+          merge-multiple: true
           path: dist
 
       - name: Publish to Test PyPI
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..363b483
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,4 @@
+prune **
+recursive-include tests *.py
+recursive-include src/anaflow *.py *.pyx
+include LICENSE README.md pyproject.toml setup.py
diff --git a/pyproject.toml b/pyproject.toml
index 1f638a0..8809f51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,16 +1,22 @@
 [build-system]
 requires = [
-    "hatchling>=1.8.0",
-    "hatch-vcs",
+    "setuptools>=77",
+    "wheel",
+    "Cython>=3",
+    "numpy>=2",
+    "scipy>=1.5.4",
+    "pentapy>=1.1.0,<2",
+    "setuptools_scm[toml]>=7",
 ]
-build-backend = "hatchling.build"
+build-backend = "setuptools.build_meta"
 
 [project]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 name = "anaflow"
 authors = [{name = "Sebastian Mueller", email = "sebastian.mueller@ufz.de"}]
 readme = "README.md"
 license = "MIT"
+license-files = ["LICENSE"]
 dynamic = ["version"]
 description = "AnaFlow - analytical solutions for the groundwater-flow equation."
 classifiers = [
@@ -18,7 +24,6 @@ classifiers = [
     "Intended Audience :: Developers",
     "Intended Audience :: End Users/Desktop",
     "Intended Audience :: Science/Research",
-    "License :: OSI Approved :: MIT License",
     "Natural Language :: English",
     "Operating System :: MacOS",
     "Operating System :: MacOS :: MacOS X",
@@ -29,20 +34,20 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3",
     "Programming Language :: Python :: 3 :: Only",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
     "Topic :: Scientific/Engineering",
     "Topic :: Software Development",
     "Topic :: Utilities",
 ]
 dependencies = [
-    "numpy>=1.14.5",
+    "numpy>=1.20.0",
     "pentapy>=1.1.0,<2",
-    "scipy>=1.1.0",
+    "scipy>=1.5.4",
 ]
 
 [project.optional-dependencies]
@@ -58,7 +63,8 @@ test = ["pytest-cov>=3"]
 check = [
   "black>=25",
   "isort[colors]",
-  "pylint",
+  "ruff>=0.5.0",
+  "cython-lint",
 ]
 
 [project.urls]
@@ -69,25 +75,11 @@ Tracker = "https://github.com/GeoStat-Framework/anaflow/issues"
 Changelog = "https://github.com/GeoStat-Framework/anaflow/blob/main/CHANGELOG.md"
 Conda-Forge = "https://anaconda.org/conda-forge/anaflow"
 
-[tool.hatch.version]
-source = "vcs"
-fallback_version = "0.0.0.dev0"
-
-[tool.hatch.version.raw-options]
+[tool.setuptools_scm]
+write_to = "src/anaflow/_version.py"
+write_to_template = "__version__ = '{version}'"
 local_scheme = "no-local-version"
-
-[tool.hatch.build.hooks.vcs]
-version-file = "src/anaflow/_version.py"
-template = "__version__ = '{version}'"
-
-[tool.hatch.build.targets.sdist]
-include = [
-  "/src",
-  "/tests",
-]
-
-[tool.hatch.build.targets.wheel]
-packages = ["src/anaflow"]
+fallback_version = "0.0.0.dev0"
 
 [tool.isort]
 profile = "black"
@@ -95,7 +87,6 @@ multi_line_output = 3
 
 [tool.black]
 target-version = [
-    "py38",
     "py39",
     "py310",
     "py311",
@@ -120,27 +111,27 @@ target-version = [
         "def __str__",
     ]
 
-[tool.pylint]
-    [tool.pylint.master]
-    extension-pkg-whitelist = [
-        "numpy",
-        "scipy",
-    ]
-    ignore = "_version.py"
-
-    [tool.pylint.message_control]
-    disable = [
-        "R0801",
-    ]
-
-    [tool.pylint.reports]
-    output-format = "colorized"
+[tool.ruff]
+line-length = 88
+target-version = "py39"
+exclude = ["_version.py"]
+
+[tool.ruff.lint]
+select = [
+    "E",
+    "F",
+    "W",
+    "I",
+]
 
-    [tool.pylint.design]
-    max-args = 25
-    max-locals = 50
-    max-branches = 30
-    max-statements = 80
-    max-attributes = 25
-    max-public-methods = 75
-    max-positional-arguments=25
+[tool.cibuildwheel]
+# Switch to using build
+build-frontend = "build"
+# Disable building py3.6/7/8, pp3.8, 32bit linux
+skip = ["*-win32", "*_i686", "*-musllinux_*", "cp31?t-*"]
+# Run the package tests using `pytest`
+test-extras = "test"
+test-command = "pytest -v {package}/tests"
+
+environment.PIP_ONLY_BINARY = "numpy,scipy"
+environment.PIP_PREFER_BINARY = "1"
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..6f363bf
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,31 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import numpy as np
+from Cython.Build import cythonize
+from setuptools import Extension, setup
+
+SRC_DIR = Path("src")
+MODULE_PATH = SRC_DIR / "anaflow" / "flow"
+MODULE_NAME = "anaflow.flow._laplace_accel"
+
+extensions = [
+    Extension(
+        MODULE_NAME,
+        [str(MODULE_PATH / "_laplace_accel.pyx")],
+        include_dirs=[np.get_include()],
+    )
+]
+
+extensions = cythonize(
+    extensions,
+    language_level=3,
+    compiler_directives={
+        "boundscheck": False,
+        "wraparound": False,
+        "initializedcheck": False,
+    },
+)
+
+setup(ext_modules=extensions, package_dir={"": "src"})
diff --git a/src/anaflow/flow/_laplace_accel.pyx b/src/anaflow/flow/_laplace_accel.pyx
new file mode 100644
index 0000000..b9d4ad1
--- /dev/null
+++ b/src/anaflow/flow/_laplace_accel.pyx
@@ -0,0 +1,330 @@
+# cython: language_level=3
+# cython: boundscheck=False
+# cython: wraparound=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: nonecheck=False
+"""
+Cython-based acceleration kernels for :mod:`anaflow.flow.laplace`.
+
+The Python wrapper takes care of all argument validation and prepares the
+NumPy arrays that are passed into these routines.
+"""
+
+from libc.math cimport fabs, isinf, isnan, sqrt
+
+import numpy as np
+
+cimport numpy as np
+from pentapy.solver cimport c_penta_solver2
+from scipy.special.cython_special cimport iv as cy_iv
+from scipy.special.cython_special cimport kv as cy_kv
+
+ctypedef np.float64_t DTYPE_t
+ctypedef np.intp_t ITYPE_t
+
+
+cdef inline bint _should_zero(double value, double cut_off) nogil:
+    """Return True if the value should be treated as zero."""
+    if cut_off > 0.0:
+        return fabs(value) < cut_off
+    return value == 0.0
+
+
+cdef inline void _shift_col_to_row(double[:, :] mat, Py_ssize_t width) nogil:
+    """In-place conversion from column-wise to row-wise banded storage."""
+    cdef Py_ssize_t up = 2
+    cdef Py_ssize_t low = 2
+    cdef Py_ssize_t i, j, shift, row
+
+    for i in range(up):
+        shift = up - i
+        if shift >= width:
+            for j in range(width):
+                mat[i, j] = 0.0
+            continue
+        for j in range(width - shift):
+            mat[i, j] = mat[i, j + shift]
+        for j in range(width - shift, width):
+            mat[i, j] = 0.0
+
+    for i in range(low):
+        shift = low - i
+        row = 4 - i
+        if shift >= width:
+            for j in range(width):
+                mat[row, j] = 0.0
+            continue
+        for j in range(width - 1, shift - 1, -1):
+            mat[row, j] = mat[row, j - shift]
+        for j in range(shift):
+            mat[row, j] = 0.0
+
+
+def solve_homogeneous(
+    np.ndarray[DTYPE_t, ndim=1] s,
+    np.ndarray[DTYPE_t, ndim=1] rad,
+    np.ndarray[DTYPE_t, ndim=1] rad_pow,
+    double r_well,
+    double r_outer,
+    double diff_sr0,
+    double nu,
+    double nu1,
+    double gamma_1_minus_nu,
+    double two_over_gamma_nu,
+    np.ndarray[DTYPE_t, ndim=1] qs,
+    double cut_off_prec,
+    np.ndarray[DTYPE_t, ndim=2] out,
+) -> None:
+    """Homogeneous aquifer kernel."""
+    cdef Py_ssize_t ns = s.shape[0]
+    cdef Py_ssize_t nr = rad.shape[0]
+    cdef Py_ssize_t si, ri
+    cdef double se, cs, row00, row01, row10, row11
+    cdef double rhs0, rhs1, det, inv_det, as_, bs
+    cdef double rad_val, cs_rad, term_k, term_i, value
+    cdef bint finite_outer = not isinf(r_outer)
+    cdef double cut = cut_off_prec if cut_off_prec > 0.0 else 0.0
+
+    for si in range(ns):
+        se = s[si]
+        cs = sqrt(se) * diff_sr0 if se >= 0.0 else 0.0
+        row00 = -gamma_1_minus_nu
+        row01 = two_over_gamma_nu
+        row10 = 0.0
+        row11 = 1.0
+
+        if r_well > 0.0:
+            value = cs * r_well
+            row00 = -cy_kv(nu1, value)
+            row01 = cy_iv(nu1, value)
+        if finite_outer:
+            value = cs * r_outer
+            row10 = cy_kv(nu, value)
+            row11 = cy_iv(nu, value)
+        else:
+            row01 = 0.0
+
+        rhs0 = qs[si]
+        rhs1 = 0.0
+        det = row00 * row11 - row01 * row10
+        if cut > 0.0 and fabs(det) < cut:
+            as_ = 0.0
+            bs = 0.0
+        elif cut == 0.0 and det == 0.0:
+            as_ = 0.0
+            bs = 0.0
+        else:
+            inv_det = 1.0 / det
+            as_ = (rhs0 * row11 - rhs1 * row01) * inv_det
+            bs = (row00 * rhs1 - row10 * rhs0) * inv_det
+
+        for ri in range(nr):
+            rad_val = rad[ri]
+            if finite_outer and not (rad_val < r_outer):
+                out[si, ri] = 0.0
+                continue
+
+            cs_rad = cs * rad_val
+            term_k = 0.0
+            term_i = 0.0
+
+            if cut > 0.0:
+                if fabs(as_) >= cut:
+                    term_k = as_ * cy_kv(nu, cs_rad)
+                if fabs(bs) >= cut:
+                    term_i = bs * cy_iv(nu, cs_rad)
+            else:
+                if as_ != 0.0:
+                    term_k = as_ * cy_kv(nu, cs_rad)
+                if bs != 0.0:
+                    term_i = bs * cy_iv(nu, cs_rad)
+
+            value = rad_pow[ri] * (term_k + term_i)
+            if isnan(value) or isinf(value):
+                value = 0.0
+            out[si, ri] = value
+
+
+def solve_multilayer(
+    np.ndarray[DTYPE_t, ndim=1] s,
+    np.ndarray[DTYPE_t, ndim=1] rad,
+    np.ndarray[DTYPE_t, ndim=1] rad_pow,
+    np.ndarray[DTYPE_t, ndim=1] r_part,
+    np.ndarray[DTYPE_t, ndim=1] difsr,
+    np.ndarray[DTYPE_t, ndim=1] tmp,
+    np.ndarray[ITYPE_t, ndim=1] pos,
+    double nu,
+    double nu1,
+    double gamma_1_minus_nu,
+    double two_over_gamma_nu,
+    np.ndarray[DTYPE_t, ndim=1] qs,
+    double cut_off_prec,
+    np.ndarray[DTYPE_t, ndim=2] out,
+) -> None:
+    """General multi-layer kernel."""
+    cdef Py_ssize_t ns = s.shape[0]
+    cdef Py_ssize_t nr = rad.shape[0]
+    cdef Py_ssize_t parts = difsr.shape[0]
+    cdef Py_ssize_t width_total = 2 * parts
+    cdef Py_ssize_t si, ri, i, row, col, first, width, idx, p
+    cdef double se, val, cs_i, cs_ip1, tmp_i, r_interface
+    cdef double cs_rad, term_k, term_i, value
+    cdef double row00, row01, row10, row11, rhs0, rhs1, det, inv_det
+    cdef double inv_cutoff = 0.0
+    cdef double cut = cut_off_prec if cut_off_prec > 0.0 else 0.0
+    cdef double even_val, odd_val
+    cdef bint finite_outer = np.isfinite(r_part[r_part.shape[0] - 1])
+
+    if cut_off_prec > 0.0:
+        inv_cutoff = 1.0 / cut_off_prec
+
+    mb_np = np.zeros((5, width_total), dtype=np.float64)
+    rhs_np = np.zeros(width_total, dtype=np.float64)
+    x_np = np.zeros(width_total, dtype=np.float64)
+    cs_np = np.zeros(parts, dtype=np.float64)
+    col_max_np = np.zeros(width_total, dtype=np.float64)
+    work_np = np.zeros((5, width_total), dtype=np.float64)
+
+    cdef double[:, :] mb = mb_np
+    cdef double[:] rhs = rhs_np
+    cdef double[:] x = x_np
+    cdef double[:] cs = cs_np
+    cdef double[:] col_max = col_max_np
+    cdef double[:, :] work = work_np
+
+    for si in range(ns):
+        se = s[si]
+        if se >= 0.0:
+            for i in range(parts):
+                cs[i] = sqrt(se) * difsr[i]
+        else:
+            for i in range(parts):
+                cs[i] = 0.0
+
+        for row in range(5):
+            for col in range(width_total):
+                mb[row, col] = 0.0
+
+        for idx in range(width_total):
+            rhs[idx] = 0.0
+            x[idx] = 0.0
+
+        rhs[0] = qs[si]
+        mb[2, 0] = -gamma_1_minus_nu
+        mb[1, 1] = two_over_gamma_nu
+        mb[2, width_total - 1] = 1.0
+
+        for i in range(parts - 1):
+            r_interface = r_part[i + 1]
+            cs_i = cs[i] * r_interface
+            cs_ip1 = cs[i + 1] * r_interface
+            tmp_i = tmp[i]
+
+            mb[0, 2 * i + 3] = -cy_iv(nu, cs_ip1)
+            mb[1, 2 * i + 2] = -cy_kv(nu, cs_ip1)
+            mb[1, 2 * i + 3] = -cy_iv(nu1, cs_ip1)
+            mb[2, 2 * i + 1] = cy_iv(nu, cs_i)
+            mb[2, 2 * i + 2] = cy_kv(nu1, cs_ip1)
+            mb[3, 2 * i] = cy_kv(nu, cs_i)
+            mb[3, 2 * i + 1] = tmp_i * cy_iv(nu1, cs_i)
+            mb[4, 2 * i] = -tmp_i * cy_kv(nu1, cs_i)
+
+        if r_part[0] > 0.0:
+            val = cs[0] * r_part[0]
+            mb[2, 0] = -cy_kv(nu1, val)
+            mb[1, 1] = cy_iv(nu1, val)
+        if finite_outer:
+            val = cs[parts - 1] * r_part[r_part.shape[0] - 1]
+            mb[3, width_total - 2] = cy_kv(nu, val)
+            mb[2, width_total - 1] = cy_iv(nu, val)
+        else:
+            mb[0, width_total - 1] = 0.0
+            mb[1, width_total - 1] = 0.0
+
+        first = parts
+        for col in range(width_total):
+            value = 0.0
+            for row in range(5):
+                val = fabs(mb[row, col])
+                if val > value:
+                    value = val
+            col_max[col] = value
+            if cut > 0.0:
+                if value < cut or (inv_cutoff > 0.0 and value > inv_cutoff):
+                    first = col // 2
+                    break
+            else:
+                if value == 0.0:
+                    first = col // 2
+                    break
+
+        if first > parts:
+            first = parts
+
+        for idx in range(2 * first, width_total):
+            x[idx] = 0.0
+
+        if first <= 1:
+            row00 = mb[2, 0]
+            row01 = mb[1, 1]
+            row10 = mb[3, 0]
+            row11 = mb[2, 1]
+            rhs0 = rhs[0]
+            rhs1 = rhs[1]
+            det = row00 * row11 - row01 * row10
+            if cut > 0.0 and fabs(det) < cut:
+                x[0] = 0.0
+                x[1] = 0.0
+            elif cut == 0.0 and det == 0.0:
+                x[0] = 0.0
+                x[1] = 0.0
+            else:
+                inv_det = 1.0 / det
+                x[0] = (rhs0 * row11 - rhs1 * row01) * inv_det
+                x[1] = (row00 * rhs1 - row10 * rhs0) * inv_det
+        else:
+            width = 2 * first
+            for row in range(5):
+                for col in range(width):
+                    work[row, col] = mb[row, col]
+            if first < parts:
+                work[4, width - 1] = 0.0
+                work[3, width - 1] = 0.0
+                work[4, width - 2] = 0.0
+            _shift_col_to_row(work, width)
+            try:
+                sol = c_penta_solver2(work[:, :width], rhs[:width])
+                for idx in range(width):
+                    x[idx] = sol[idx]
+            except ZeroDivisionError:
+                for idx in range(width):
+                    x[idx] = 0.0
+
+        for idx in range(2 * first):
+            if _should_zero(x[idx], cut):
+                x[idx] = 0.0
+
+        for ri in range(nr):
+            p = pos[ri]
+            cs_rad = cs[p] * rad[ri]
+            term_k = 0.0
+            term_i = 0.0
+            even_val = x[2 * p]
+            odd_val = x[2 * p + 1]
+
+            if cut > 0.0:
+                if fabs(even_val) >= cut:
+                    term_k = even_val * cy_kv(nu, cs_rad)
+                if fabs(odd_val) >= cut:
+                    term_i = odd_val * cy_iv(nu, cs_rad)
+            else:
+                if even_val != 0.0:
+                    term_k = even_val * cy_kv(nu, cs_rad)
+                if odd_val != 0.0:
+                    term_i = odd_val * cy_iv(nu, cs_rad)
+
+            value = rad_pow[ri] * (term_k + term_i)
+            if isnan(value) or isinf(value):
+                value = 0.0
+            out[si, ri] = value
diff --git a/src/anaflow/flow/laplace.py b/src/anaflow/flow/laplace.py
index a47b917..106a831 100644
--- a/src/anaflow/flow/laplace.py
+++ b/src/anaflow/flow/laplace.py
@@ -12,14 +12,20 @@
 """
 
 # pylint: disable=C0103,R0915
-import warnings
-
 import numpy as np
-from pentapy import solve
-from scipy.special import erfcx, gamma, iv, kv
+from scipy.special import erfcx, gamma
 
 from anaflow.tools.special import sph_surf
 
+try:
+    from ._laplace_accel import solve_homogeneous as _solve_homogeneous
+    from ._laplace_accel import solve_multilayer as _solve_multilayer
+except ImportError as exc:  # pragma: no cover - extension is mandatory
+    raise ImportError(
+        "anaflow.flow._laplace_accel extension missing. "
+        "Please build the Cython module (e.g. `python setup.py build_ext --inplace`)."
+    ) from exc
+
 __all__ = ["grf_laplace"]
 
 
@@ -130,13 +136,15 @@ def grf_laplace(
            [-4.58447458e-01, -1.12056319e-02, -9.85673855e-04]])
     """
     cond_kw = {} if cond_kw is None else cond_kw
-    cond = cond if callable(cond) else PUMP_COND[cond]
-    # ensure that input is treated as arrays
-    s = np.squeeze(s).reshape(-1)
-    rad = np.squeeze(rad).reshape(-1)
-    S_part = np.squeeze(S_part).reshape(-1)
-    K_part = np.squeeze(K_part).reshape(-1)
-    R_part = np.squeeze(R_part).reshape(-1)
+    pump_cond = cond if callable(cond) else PUMP_COND[cond]
+
+    # ensure that input is treated as contiguous arrays
+    s = np.ascontiguousarray(np.atleast_1d(np.asarray(s, dtype=np.float64)))
+    rad = np.ascontiguousarray(np.atleast_1d(np.asarray(rad, dtype=np.float64)))
+    S_part = np.ascontiguousarray(np.atleast_1d(np.asarray(S_part, dtype=np.float64)))
+    K_part = np.ascontiguousarray(np.atleast_1d(np.asarray(K_part, dtype=np.float64)))
+    R_part = np.ascontiguousarray(np.atleast_1d(np.asarray(R_part, dtype=np.float64)))
+
     # the dimension is used by nu in the literature (See Barker 88)
     dim = float(dim)
     nu = 1.0 - dim / 2.0
@@ -148,6 +156,7 @@ def grf_laplace(
     parts = len(K_part)
     # set the conductivity at the well
     K_well = K_part[0] if K_well is None else float(K_well)
+
     # check the input
     if not len(R_part) - 1 == len(S_part) == len(K_part) > 0:
         raise ValueError("R_part, S_part and K_part need matching lengths.")
@@ -168,148 +177,70 @@ def grf_laplace(
     if K_well <= 0:
         raise ValueError("The well conductivity needs to be positiv.")
 
-    # initialize the result
-    res = np.zeros(s.shape + rad.shape)
-    # the first sqrt of the diffusivity values
-    diff_sr0 = np.sqrt(S_part[0] / K_part[0])
-    # set the general pumping-condtion depending on the well-radius
+    cut_off_prec = float(cut_off_prec)
+
+    # pre-compute helper arrays
+    res = np.zeros((s.size, rad.size), dtype=np.float64)
+    diff_sr0 = float(np.sqrt(S_part[0] / K_part[0]))
+    cond_vals = np.asarray(pump_cond(s, **cond_kw), dtype=np.float64)
+    if cond_vals.shape != s.shape:
+        cond_vals = np.broadcast_to(cond_vals, s.shape).astype(np.float64, copy=True)
+    cond_vals = np.ascontiguousarray(cond_vals, dtype=np.float64)
+
     if R_part[0] > 0.0:
-        Qs = -(s ** (-0.5)) / diff_sr0 * R_part[0] ** nu1 * cond(s, **cond_kw)
+        qs = -np.power(s, -0.5) / diff_sr0 * R_part[0] ** nu1 * cond_vals
     else:
-        Qs = -((2 / diff_sr0) ** nu) * s ** (-nu / 2) * cond(s, **cond_kw)
+        qs = -np.power(2.0 / diff_sr0, nu) * np.power(s, -nu / 2.0) * cond_vals
+    qs = np.ascontiguousarray(qs, dtype=np.float64)
+
+    difsr = np.ascontiguousarray(np.sqrt(S_part / K_part), dtype=np.float64)
+    rad_pow = np.ascontiguousarray(np.power(rad, nu), dtype=np.float64)
+    gamma_1_minus_nu = float(gamma(1.0 - nu))
+    two_over_gamma_nu = 2.0 / float(gamma(nu))
 
-    # if there is a homgeneouse aquifer, compute the result by hand
     if parts == 1:
-        # initialize the equation system
-        V = np.zeros(2, dtype=float)
-        M = np.array([[-gamma(1 - nu), 2.0 / gamma(nu)], [0, 1]])
-
-        for si, se in enumerate(s):
-            Cs = np.sqrt(se) * diff_sr0
-            # set the pumping-condition at the well
-            V[0] = Qs[si]
-            # incorporate the boundary-conditions
-            if R_part[0] > 0.0:
-                M[0, :] = [-kv(nu1, Cs * R_part[0]), iv(nu1, Cs * R_part[0])]
-            if R_part[-1] < np.inf:
-                M[1, :] = [kv(nu, Cs * R_part[-1]), iv(nu, Cs * R_part[-1])]
-            else:
-                M[0, 1] = 0  # Bs is 0 in this case either way
-            # solve the equation system
-            As, Bs = np.linalg.solve(M, V)
-
-            # calculate the head
-            for ri, re in enumerate(rad):
-                if re < R_part[-1]:
-                    res[si, ri] = re**nu * (As * kv(nu, Cs * re) + Bs * iv(nu, Cs * re))
-
-    # if there is more than one partition, create an equation system
+        _solve_homogeneous(
+            s,
+            rad,
+            rad_pow,
+            float(R_part[0]),
+            float(R_part[-1]),
+            diff_sr0,
+            nu,
+            nu1,
+            gamma_1_minus_nu,
+            two_over_gamma_nu,
+            qs,
+            cut_off_prec,
+            res,
+        )
     else:
-        # initialize LHS and RHS for the linear equation system
-        # Mb is the banded matrix for the Eq-System
-        V = np.zeros(2 * parts)
-        Mb = np.zeros((5, 2 * parts))
-        X = np.zeros(2 * parts)
-        # set the standard boundary conditions for rwell=0.0 and rinf=np.inf
-        Mb[2, 0] = -gamma(1 - nu)
-        Mb[1, 1] = 2.0 / gamma(nu)
-        Mb[2, -1] = 1.0
-
-        # calculate the consecutive fractions of the conductivities
-        Kfrac = K_part[:-1] / K_part[1:]
-        # calculate the square-root of the diffusivities
-        difsr = np.sqrt(S_part / K_part)
-        # calculate a temporal substitution (factor from mass-conservation)
-        tmp = Kfrac * difsr[:-1] / difsr[1:]
-        # match the radii to the different disks
-        pos = np.searchsorted(R_part, rad) - 1
-
-        # iterate over the laplace-variable
-        for si, se in enumerate(s):
-            Cs = np.sqrt(se) * difsr
-
-            # set the pumping-condition at the well
-            # --> implement other pumping conditions
-            V[0] = Qs[si]
-
-            # generate the equation system as banded matrix
-            for i in range(parts - 1):
-                Mb[0, 2 * i + 3] = -iv(nu, Cs[i + 1] * R_part[i + 1])
-                Mb[1, 2 * i + 2 : 2 * i + 4] = [
-                    -kv(nu, Cs[i + 1] * R_part[i + 1]),
-                    -iv(nu1, Cs[i + 1] * R_part[i + 1]),
-                ]
-                Mb[2, 2 * i + 1 : 2 * i + 3] = [
-                    iv(nu, Cs[i] * R_part[i + 1]),
-                    kv(nu1, Cs[i + 1] * R_part[i + 1]),
-                ]
-                Mb[3, 2 * i : 2 * i + 2] = [
-                    kv(nu, Cs[i] * R_part[i + 1]),
-                    tmp[i] * iv(nu1, Cs[i] * R_part[i + 1]),
-                ]
-                Mb[4, 2 * i] = -tmp[i] * kv(nu1, Cs[i] * R_part[i + 1])
-
-            # set the boundary-conditions if needed
-            if R_part[0] > 0.0:
-                Mb[2, 0] = -kv(nu1, Cs[0] * R_part[0])
-                Mb[1, 1] = iv(nu1, Cs[0] * R_part[0])
-            if R_part[-1] < np.inf:
-                Mb[-2, -2] = kv(nu, Cs[-1] * R_part[-1])
-                Mb[2, -1] = iv(nu, Cs[-1] * R_part[-1])
-            else:  # erase the last row, since X[-1] will be 0
-                Mb[0, -1] = 0
-                Mb[1, -1] = 0
-
-            # find first disk which has no impact
-            Mb_cond = np.max(np.abs(Mb), axis=0)
-            Mb_cond_lo = Mb_cond < cut_off_prec
-            Mb_cond_hi = Mb_cond > 1 / cut_off_prec
-            Mb_cond = np.logical_or(Mb_cond_lo, Mb_cond_hi)
-            cond = np.where(Mb_cond)[0]
-            found = cond.shape[0] > 0
-            first = cond[0] // 2 if found else parts
-
-            # initialize coefficients
-            X[2 * first :] = 0.0
-            # only the first disk has an impact
-            if first <= 1:
-                M_sgl = np.eye(2, dtype=float)
-                M_sgl[:, 0] = Mb[2:4, 0]
-                M_sgl[:, 1] = Mb[1:3, 1]
-                # solve the equation system
-                try:
-                    X[:2] = np.linalg.solve(M_sgl, V[:2])
-                except np.linalg.LinAlgError:
-                    # set 0 if matrix singular
-                    X[:2] = 0
-            elif first > 1:
-                # shrink the matrix
-                M_sgl = Mb[:, : 2 * first]
-                if first < parts:
-                    M_sgl[-1, -1] = 0
-                    M_sgl[-2, -1] = 0
-                    M_sgl[-1, -2] = 0
-                X[: 2 * first] = solve(
-                    M_sgl, V[: 2 * first], is_flat=True, index_row_wise=False
-                )
-            np.nan_to_num(X, copy=False)
-
-            # calculate the head (ignore small values)
-            with warnings.catch_warnings():
-                warnings.simplefilter("ignore", RuntimeWarning)
-                k0_sub = X[2 * pos] * kv(nu, Cs[pos] * rad)
-                k0_sub[np.abs(X[2 * pos]) < cut_off_prec] = 0
-                i0_sub = X[2 * pos + 1] * iv(nu, Cs[pos] * rad)
-                i0_sub[np.abs(X[2 * pos + 1]) < cut_off_prec] = 0
-                res[si, :] = rad**nu * (k0_sub + i0_sub)
-
-    # set problematic values to 0
-    # --> the algorithm tends to violate small values,
-    #     therefore this approach is suitable
+        tmp = np.ascontiguousarray(
+            (K_part[:-1] / K_part[1:]) * (difsr[:-1] / difsr[1:]),
+            dtype=np.float64,
+        )
+        pos = np.ascontiguousarray(
+            np.searchsorted(R_part, rad, side="left") - 1, dtype=np.intp
+        )
+        _solve_multilayer(
+            s,
+            rad,
+            rad_pow,
+            R_part,
+            difsr,
+            tmp,
+            pos,
+            nu,
+            nu1,
+            gamma_1_minus_nu,
+            two_over_gamma_nu,
+            qs,
+            cut_off_prec,
+            res,
+        )
+
     np.nan_to_num(res, copy=False)
-    # scale to pumpingrate
     res *= rate / (K_well * sph_surf(dim) * lat_ext ** (3.0 - dim))
-
     return res