diff --git a/examples/input.nn.recommended b/examples/input.nn.recommended
index e53934e0f..a0030bf52 100644
--- a/examples/input.nn.recommended
+++ b/examples/input.nn.recommended
@@ -35,6 +35,7 @@ parallel_mode 0 # Training parallelization used (
 jacobian_mode           1  # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy         0  # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode          2  # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type         0  # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy  1  # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force   1  # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results   # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/Cu2S_PBE/input.nn b/examples/nnp-train/Cu2S_PBE/input.nn
index b4aa9333e..23eccf6db 100644
--- a/examples/nnp-train/Cu2S_PBE/input.nn
+++ b/examples/nnp-train/Cu2S_PBE/input.nn
@@ -44,6 +44,7 @@ parallel_mode 0 # Training parallelization used (
 jacobian_mode           1  # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy         0  # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode          2  # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type         0  # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy  1  # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force   1  # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results   # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/H2O_RPBE-D3/input.nn b/examples/nnp-train/H2O_RPBE-D3/input.nn
index b2666eecd..05b3fcacb 100644
--- a/examples/nnp-train/H2O_RPBE-D3/input.nn
+++ b/examples/nnp-train/H2O_RPBE-D3/input.nn
@@ -44,6 +44,7 @@ parallel_mode 0 # Training parallelization used (
 jacobian_mode           1  # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy         0  # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode          2  # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type         0  # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy  1  # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force   1  # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results   # Keep symmetry function results in memory.
diff --git a/examples/nnp-train/LJ/input.nn b/examples/nnp-train/LJ/input.nn
index d42118f4b..2a1812341 100644
--- a/examples/nnp-train/LJ/input.nn
+++ b/examples/nnp-train/LJ/input.nn
@@ -34,6 +34,7 @@ parallel_mode 0 # Training parallelization used (
 jacobian_mode           1  # Jacobian computation mode (0 = Summation to single gradient, 1 = Per-task summed gradient, 2 = Full Jacobian).
 update_strategy         0  # Update strategy (0 = Combined, 1 = Per-element).
 selection_mode          2  # Update candidate selection mode (0 = Random, 1 = Sort, 2 = Threshold).
+decoupling_type         0  # Kalman filter decoupling type (0 = GEKF, 1 = ED-GEKF, 2 = Layer, 3 = NDEKF, 4 = FDEKF).
 task_batch_size_energy  1  # Number of energy update candidates prepared per task for each update (0 = Entire training set).
 task_batch_size_force   1  # Number of force update candidates prepared per task for each update (0 = Entire training set).
 memorize_symfunc_results   # Keep symmetry function results in memory.
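Background for the new decoupling_type keyword: it selects how the Kalman filter error covariance matrix P is partitioned into independent diagonal blocks (decoupling groups); the implementation follows in src/libnnptrain/KalmanFilter.cpp further below. The stored covariance size grows as

    sizeP = sum_g (n_g)^2

where n_g is the number of weights in group g (this is exactly the sizeP accumulated in KalmanFilter::setupDecoupling()). A purely hypothetical size illustration: a network with 1000 weights needs 10^6 covariance entries under GEKF (one global group), 25 * 40^2 = 40000 entries (4 %) when split into 25 node groups of 40 connections each (NDEKF), and only 1000 entries under FDEKF (fully diagonal P).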
diff --git a/src/libnnp/Mode.cpp b/src/libnnp/Mode.cpp
index 475ec8ff3..bd7359734 100644
--- a/src/libnnp/Mode.cpp
+++ b/src/libnnp/Mode.cpp
@@ -884,7 +884,9 @@ void Mode::setupNeuralNetworkWeights(string const& fileNameFormat)
                 weights.push_back(atof(splitLine.at(0).c_str()));
             }
         }
-        it->neuralNetwork->setConnections(&(weights.front()));
+        // Attention: need alternative (old) ordering scheme here for
+        // backward compatibility!
+        it->neuralNetwork->setConnectionsAO(&(weights.front()));
 
         file.close();
     }
diff --git a/src/libnnp/NeuralNetwork.cpp b/src/libnnp/NeuralNetwork.cpp
index e7b7808c6..ea812a336 100644
--- a/src/libnnp/NeuralNetwork.cpp
+++ b/src/libnnp/NeuralNetwork.cpp
@@ -80,12 +80,26 @@ NeuralNetwork(int numLayers,
         weightOffset[i] = weightOffset[i-1]
                         + (layers[i-1].numNeurons + 1) * layers[i].numNeurons;
     }
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    biasOffset = new int*[numLayers-1];
+    for (int i = 0; i < numLayers-1; i++)
+    {
+        biasOffset[i] = new int[layers[i+1].numNeurons];
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            biasOffset[i][j] = weightOffset[i]
+                             + (layers[i+1].numNeuronsPrevLayer + 1)
+                             * (j + 1) - 1;
+        }
+    }
+#else
     biasOffset = new int[numLayers-1];
     for (int i = 0; i < numLayers-1; i++)
     {
         biasOffset[i] = weightOffset[i]
                       + layers[i+1].numNeurons * layers[i].numNeurons;
     }
+#endif
     biasOnlyOffset = new int[numLayers-1];
     biasOnlyOffset[0] = 0;
     for (int i = 1; i < numLayers-1; i++)
@@ -106,7 +120,15 @@ NeuralNetwork::~NeuralNetwork()
     }
     delete[] layers;
     delete[] weightOffset;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    for (int i = 0; i < numLayers - 1; i++)
+    {
+        delete[] biasOffset[i];
+    }
+    delete[] biasOffset;
+#else
     delete[] biasOffset;
+#endif
     delete[] biasOnlyOffset;
 }
@@ -148,6 +170,27 @@ void NeuralNetwork::setConnections(double const* const& connections)
 {
     int count = 0;
 
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                layers[i].neurons[j].weights[k] = connections[count];
+                count++;
+            }
+            layers[i].neurons[j].bias = connections[count];
+            count++;
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::setConnectionsAO(double const* const& connections)
+{
+    int count = 0;
+
     for (int i = 1; i < numLayers; i++)
     {
         for (int j = 0; j < layers[i].numNeuronsPrevLayer; j++)
@@ -172,6 +215,27 @@ void NeuralNetwork::getConnections(double* connections) const
 {
     int count = 0;
 
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                connections[count] = layers[i].neurons[j].weights[k];
+                count++;
+            }
+            connections[count] = layers[i].neurons[j].bias;
+            count++;
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::getConnectionsAO(double* connections) const
+{
+    int count = 0;
+
     for (int i = 1; i < numLayers; i++)
     {
         for (int j = 0; j < layers[i].numNeuronsPrevLayer; j++)
@@ -428,6 +492,52 @@ void NeuralNetwork::calculateDEdG(double *dEdG) const
 
 void NeuralNetwork::calculateDEdc(double* dEdc) const
 {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    int count = 0;
+
+    for (int i = 0; i < numConnections; i++)
+    {
+        dEdc[i] = 0.0;
+    }
+
+    for (int i = 0; i < outputLayer->numNeurons; i++)
+    {
+        dEdc[biasOffset[numLayers-2][i]] = outputLayer->neurons[i].dfdx;
+        if (normalizeNeurons)
+        {
+            dEdc[biasOffset[numLayers-2][i]] /=
+                outputLayer->numNeuronsPrevLayer;
+        }
+    }
+
+    for (int i = numLayers - 2; i >= 0; i--)
+    {
+        count = 0;
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                dEdc[weightOffset[i]+count] = dEdc[biasOffset[i][j]]
+                                            * layers[i].neurons[k].value;
+                count++;
+                if (i >= 1)
+                {
+                    dEdc[biasOffset[i-1][k]] += dEdc[biasOffset[i][j]]
+                        * layers[i+1].neurons[j].weights[k]
+                        * layers[i].neurons[k].dfdx;
+                }
+            }
+            count++;
+        }
+        if (normalizeNeurons && i >= 1)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                dEdc[biasOffset[i-1][k]] /= layers[i].numNeuronsPrevLayer;
+            }
+        }
+    }
+#else
     int count = 0;
 
     for (int i = 0; i < numConnections; i++)
@@ -468,6 +578,7 @@ void NeuralNetwork::calculateDEdc(double* dEdc) const
             }
         }
     }
+#endif
 
     return;
 }
@@ -512,6 +623,67 @@ void NeuralNetwork::calculateDFdc(double* dFdc,
 }
 
 void NeuralNetwork::writeConnections(std::ofstream& file) const
+{
+    // File header.
+    vector<string> title;
+    vector<string> colName;
+    vector<string> colInfo;
+    vector<size_t> colSize;
+    title.push_back("Neural network connection values (weights and biases).");
+    colSize.push_back(24);
+    colName.push_back("connection");
+    colInfo.push_back("Neural network connection value.");
+    colSize.push_back(1);
+    colName.push_back("t");
+    colInfo.push_back("Connection type (a = weight, b = bias).");
+    colSize.push_back(9);
+    colName.push_back("index");
+    colInfo.push_back("Index enumerating weights.");
+    colSize.push_back(5);
+    colName.push_back("l_s");
+    colInfo.push_back("Starting point layer (end point layer for biases).");
+    colSize.push_back(5);
+    colName.push_back("n_s");
+    colInfo.push_back("Starting point neuron in starting layer (end point "
+                      "neuron for biases).");
+    colSize.push_back(5);
+    colName.push_back("l_e");
+    colInfo.push_back("End point layer.");
+    colSize.push_back(5);
+    colName.push_back("n_e");
+    colInfo.push_back("End point neuron in end layer.");
+    appendLinesToFile(file,
+                      createFileHeader(title, colSize, colName, colInfo));
+
+    int count = 0;
+    for (int i = 1; i < numLayers; i++)
+    {
+        for (int j = 0; j < layers[i].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeuronsPrevLayer; k++)
+            {
+                count++;
+                file << strpr("%24.16E a %9d %5d %5d %5d %5d\n",
+                              layers[i].neurons[j].weights[k],
+                              count,
+                              i - 1,
+                              k + 1,
+                              i,
+                              j + 1);
+            }
+            count++;
+            file << strpr("%24.16E b %9d %5d %5d\n",
+                          layers[i].neurons[j].bias,
+                          count,
+                          i,
+                          j + 1);
+        }
+    }
+
+    return;
+}
+
+void NeuralNetwork::writeConnectionsAO(std::ofstream& file) const
 {
     // File header.
     vector<string> title;
@@ -645,6 +817,67 @@ void NeuralNetwork::calculateD2EdGdc(int index,
                                      double const* const& dEdb,
                                      double*              d2EdGdc) const
 {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    int count = 0;
+
+    for (int i = 0; i < outputLayer->numNeurons; i++)
+    {
+        d2EdGdc[biasOffset[numLayers-2][i]] = outputLayer->neurons[i].d2fdx2
+                                            * outputLayer->neurons[i].dxdG;
+        if (normalizeNeurons)
+        {
+            d2EdGdc[biasOffset[numLayers-2][i]] /=
+                outputLayer->numNeuronsPrevLayer;
+        }
+    }
+
+    for (int i = numLayers-2; i >= 0; i--)
+    {
+        count = 0;
+        for (int j = 0; j < layers[i+1].numNeurons; j++)
+        {
+            for (int k = 0; k < layers[i].numNeurons; k++)
+            {
+                if (i == 0)
+                {
+                    d2EdGdc[weightOffset[i]+count] = d2EdGdc[biasOffset[i][j]]
+                        * layers[i].neurons[k].value;
+                    if (k == index)
+                    {
+                        d2EdGdc[weightOffset[i]+count] +=
+                            dEdb[biasOnlyOffset[i]+j];
+                    }
+                }
+                else
+                {
+                    d2EdGdc[weightOffset[i]+count] = d2EdGdc[biasOffset[i][j]]
+                        * layers[i].neurons[k].value
+                        + dEdb[biasOnlyOffset[i]+j] * layers[i].neurons[k].dfdx
+                        * layers[i].neurons[k].dxdG;
+                }
+                count++;
+                if (i >= 1)
+                {
+                    d2EdGdc[biasOffset[i-1][k]] +=
+                        layers[i+1].neurons[j].weights[k]
+                        * (d2EdGdc[biasOffset[i][j]]
+                           * layers[i].neurons[k].dfdx
+                           + dEdb[biasOnlyOffset[i]+j]
+                           * layers[i].neurons[k].d2fdx2
+                           * layers[i].neurons[k].dxdG);
+                }
+            }
+            count++;
+        }
+        for (int k = 0; k < layers[i].numNeurons; k++)
+        {
+            if (normalizeNeurons && i >= 1)
+            {
+                d2EdGdc[biasOffset[i-1][k]] /= layers[i].numNeuronsPrevLayer;
+            }
+        }
+    }
+#else
     int count = 0;
 
     for (int i = 0; i < outputLayer->numNeurons; i++)
@@ -699,6 +932,7 @@ void NeuralNetwork::calculateD2EdGdc(int index,
             }
         }
     }
+#endif
 
     return;
 }
@@ -921,6 +1155,45 @@ void NeuralNetwork::getNeuronStatistics(long* count,
     return;
 }
 
+vector<pair<size_t, size_t>> NeuralNetwork::getLayerLimits() const
+{
+    vector<pair<size_t, size_t>> limits;
+
+    for (int i = 0; i < numLayers - 2; ++i)
+    {
+        limits.push_back(make_pair(weightOffset[i],
+                                   weightOffset[i + 1] - 1));
+    }
+    limits.push_back(make_pair(weightOffset[numLayers - 2],
+                               numConnections - 1));
+
+    return limits;
+}
+
+vector<pair<size_t, size_t>> NeuralNetwork::getNeuronLimits() const
+{
+    vector<pair<size_t, size_t>> limits;
+
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    for (int i = 0; i < numLayers - 1; ++i)
+    {
+        limits.push_back(make_pair(weightOffset[i],
+                                   biasOffset[i][0]));
+        for (int j = 0; j < layers[i+1].numNeurons - 1; ++j)
+        {
+            limits.push_back(make_pair(biasOffset[i][j] + 1,
+                                       biasOffset[i][j+1]));
+        }
+    }
+#else
+    throw runtime_error("ERROR: Neuron boundaries not available with "
+                        "alternative (old) weight memory layout, recompile "
+                        "without -DALTERNATIVE_WEIGHT_ORDERING.\n");
+#endif
+
+    return limits;
+}
+
 /*
 void NeuralNetwork::writeStatus(int element, int epoch)
 {
@@ -969,7 +1242,15 @@ long NeuralNetwork::getMemoryUsage()
     int numNeurons = getNumNeurons();
 
     mem += (numLayers - 1) * sizeof(int);  // weightOffset
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    mem += (numLayers - 1) * sizeof(int*); // biasOffset
+    for (int i = 0; i < numLayers - 1; i++)
+    {
+        mem += layers[i+1].numNeurons * sizeof(int);
+    }
+#else
     mem += (numLayers - 1) * sizeof(int);  // biasOffset
+#endif
     mem += (numLayers - 1) * sizeof(int);  // biasOnlyOffset
     mem += numLayers * sizeof(Layer);      // layers
     mem += numNeurons * sizeof(Neuron);    // neurons
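To make the two orderings concrete, the following standalone sketch (hypothetical 2-2-1 network, i.e. two inputs, two hidden neurons, one output; not part of the library) prints both index sequences, using the convention from the doxygen comments in NeuralNetwork.h below, where a[l-1,l]_jk connects source neuron j to target neuron k:

    #include <cstdio>

    int main()
    {
        int const n[3] = {2, 2, 1}; // hypothetical 2-2-1 architecture

        std::printf("setConnections() (new, per-neuron) order:\n");
        for (int i = 1; i < 3; i++)              // target layer i
        {
            for (int k = 0; k < n[i]; k++)       // target neuron k
            {
                for (int j = 0; j < n[i-1]; j++) // source neuron j
                {
                    std::printf("a%d%d_%d%d ", i - 1, i, j, k);
                }
                std::printf("b%d_%d  ", i, k);   // bias right after weights
            }
        }
        std::printf("\n");

        std::printf("setConnectionsAO() (old, per-layer) order:\n");
        for (int i = 1; i < 3; i++)
        {
            for (int j = 0; j < n[i-1]; j++)     // all weights of layer first
            {
                for (int k = 0; k < n[i]; k++)
                {
                    std::printf("a%d%d_%d%d ", i - 1, i, j, k);
                }
            }
            for (int k = 0; k < n[i]; k++)       // ... then all biases
            {
                std::printf("b%d_%d  ", i, k);
            }
        }
        std::printf("\n");

        return 0;
    }

Expected output: the new order groups each neuron's connections ("a01_00 a01_10 b1_0  a01_01 a01_11 b1_1  a12_00 a12_10 b2_0"), the old order groups all weights before all biases per layer ("a01_00 a01_01 a01_10 a01_11 b1_0 b1_1  a12_00 a12_10 b2_0").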
diff --git a/src/libnnp/NeuralNetwork.h b/src/libnnp/NeuralNetwork.h
index c56c58e9f..8cbef0cfd 100644
--- a/src/libnnp/NeuralNetwork.h
+++ b/src/libnnp/NeuralNetwork.h
@@ -17,8 +17,10 @@
 #ifndef NEURALNETWORK_H
 #define NEURALNETWORK_H
 
+#include <cstddef> // std::size_t
 #include <fstream> // std::ofstream
 #include <string>  // std::string
+#include <utility> // std::pair
 #include <vector>  // std::vector
 
 namespace nnp
@@ -152,29 +154,70 @@ class NeuralNetwork
      * @f[
      * \underbrace{
      *   \overbrace{
-     *     a^{01}_{00}, \ldots, a^{01}_{0m_1},
-     *     a^{01}_{10}, \ldots, a^{01}_{1m_1},
+     *     a^{01}_{00}, \ldots, a^{01}_{n_00}, b^{1}_{0}
+     *   }^{\text{Neuron}^{1}_{0}}
+     *   \overbrace{
+     *     a^{01}_{01}, \ldots, a^{01}_{n_01}, b^{1}_{1}
+     *   }^{\text{Neuron}^{1}_{1}}
+     *   \ldots,
+     *   \overbrace{
+     *     a^{01}_{0n_1}, \ldots, a^{01}_{n_0n_1}, b^{1}_{n_1}
+     *   }^{\text{Neuron}^{1}_{n_1}}
+     * }_{\text{Layer } 0 \rightarrow 1},
+     * \underbrace{
+     *   a^{12}_{00}, \ldots, b^{2}_{n_2}
+     * }_{\text{Layer } 1 \rightarrow 2},
+     * \ldots,
+     * \underbrace{
+     *   a^{p-1,p}_{00}, \ldots, b^{p}_{n_p}
+     * }_{\text{Layer } p-1 \rightarrow p}
+     * @f]
+     * where @f$a^{i-1, i}_{jk}@f$ is the weight connecting neuron
+     * @f$j@f$ in layer @f$i-1@f$ to neuron @f$k@f$ in layer
+     * @f$i@f$ and @f$b^{i}_{k}@f$ is the bias assigned to neuron
+     * @f$k@f$ in layer @f$i@f$.
+     *
+     * This ordering scheme is used internally to store weights in memory.
+     * Because all weights and biases connected to a neuron are aligned in a
+     * contiguous block, this memory layout simplifies the implementation of
+     * node-decoupled Kalman filter training (NDEKF). However, for backward
+     * compatibility reasons this layout is NOT used for storing weights on
+     * disk (see setConnectionsAO()).
+     */
+    void                     setConnections(double const* const& connections);
+    /** Set neural network weights and biases (alternative ordering).
+     *
+     * @param[in] connections One-dimensional array with neural network
+     *                        connections in the following order:
+     * @f[
+     * \underbrace{
+     *   \overbrace{
+     *     a^{01}_{00}, \ldots, a^{01}_{0n_1},
+     *     a^{01}_{10}, \ldots, a^{01}_{1n_1},
      *     \ldots,
-     *     a^{01}_{n_00}, \ldots, a^{01}_{n_0m_1},
+     *     a^{01}_{n_00}, \ldots, a^{01}_{n_0n_1},
      *   }^{\text{Weights}}
      *   \overbrace{
-     *     b^{1}_{0}, \ldots, b^{1}_{m_1}
+     *     b^{1}_{0}, \ldots, b^{1}_{n_1}
      *   }^{\text{Biases}}
      * }_{\text{Layer } 0 \rightarrow 1},
      * \underbrace{
-     *   a^{12}_{00}, \ldots, b^{2}_{m_2}
+     *   a^{12}_{00}, \ldots, b^{2}_{n_2}
      * }_{\text{Layer } 1 \rightarrow 2},
      * \ldots,
      * \underbrace{
-     *   a^{p-1,p}_{00}, \ldots, b^{p}_{m_p}
+     *   a^{p-1,p}_{00}, \ldots, b^{p}_{n_p}
      * }_{\text{Layer } p-1 \rightarrow p}
      * @f]
      * where @f$a^{i-1, i}_{jk}@f$ is the weight connecting neuron
      * @f$j@f$ in layer @f$i-1@f$ to neuron @f$k@f$ in layer
      * @f$i@f$ and @f$b^{i}_{k}@f$ is the bias assigned to neuron
      * @f$k@f$ in layer @f$i@f$.
+     *
+     * This memory layout is used when storing weights on disk.
      */
-    void                     setConnections(double const* const& connections);
+    void                     setConnectionsAO(
+                                           double const* const& connections);
     /** Get neural network weights and biases.
      *
      * @param[out] connections One-dimensional array with neural network
@@ -182,6 +225,13 @@ class NeuralNetwork
      *                         connections (same order as described in
      *                         #setConnections())
      */
     void                     getConnections(double* connections) const;
+    /** Get neural network weights and biases (alternative ordering).
+     *
+     * @param[out] connections One-dimensional array with neural network
+     *                         connections (same order as described in
+     *                         #setConnectionsAO())
+     */
+    void                     getConnectionsAO(double* connections) const;
     /** Initialize connections with random numbers.
      *
      * @param[in] seed Random number generator seed.
@@ -211,14 +261,14 @@ class NeuralNetwork
                                               ModificationScheme modificationScheme,
                                               double parameter1,
                                               double parameter2);
-    /** Set neural network input layer node values.
+    /** Set neural network input layer neuron values.
      *
-     * @param[in] input Input layer node values.
+     * @param[in] input Input layer neuron values.
      */
     void                     setInput(double const* const& input) const;
-    /** Get neural network output layer node values.
+    /** Get neural network output layer neuron values.
      *
-     * @param[out] output Output layer node values.
+     * @param[out] output Output layer neuron values.
      */
     void                     getOutput(double* output) const;
     /** Propagate input information through all layers.
@@ -295,10 +345,20 @@ class NeuralNetwork
     void                     calculateDFdc(double*              dFdc,
                                            double const* const& dGdxyz) const;
     /** Write connections to file.
+     *
+     * __CAUTION__: For compatibility reasons this format is NOT used for
+     * storing NN weights to disk!
      *
      * @param[in,out] file File stream to write to.
      */
     void                     writeConnections(std::ofstream& file) const;
+    /** Write connections to file (alternative ordering).
+     *
+     * This weight ordering is used when writing weights to disk.
+     *
+     * @param[in,out] file File stream to write to.
+     */
+    void                     writeConnectionsAO(std::ofstream& file) const;
     /** Return gathered neuron statistics.
      *
      * @param[out] count Number of neuron output value calculations.
@@ -333,6 +393,24 @@ class NeuralNetwork
      * Counters and summation variables for neuron statistics are reset.
      */
     void                     resetNeuronStatistics();
+    /** Get layer limits in combined weight vector (for Kalman filter
+     * decoupling setup).
+     *
+     * @return Vector with layer limits as pair (begin, end),
+     *         (numLayers - 1) points.
+     */
+    std::vector<std::pair<std::size_t, std::size_t>>
+                             getLayerLimits() const;
+    /** Get neuron limits in combined weight vector (for Kalman filter
+     * decoupling setup).
+     *
+     * @return Vector with neuron limits as pair (begin, end),
+     *         (numNeurons - inputLayer->numNeurons) points.
+     */
+    std::vector<std::pair<std::size_t, std::size_t>>
+                             getNeuronLimits() const;
     //void writeStatus(int, int);
     long                     getMemoryUsage();
     /** Print neural network architecture.
@@ -404,8 +482,13 @@ class NeuralNetwork
     int          numHiddenLayers;
     /// Offset address of weights per layer in combined weights+bias array.
     int*         weightOffset;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+    /// Offset address of each neuron per layer in combined weights+bias array.
+    int**        biasOffset;
+#else
     /// Offset address of biases per layer in combined weights+bias array.
     int*         biasOffset;
+#endif
     /// Offset address of biases per layer in bias only array.
     int*         biasOnlyOffset;
     /// Pointer to input layer.
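A note on the offset arithmetic behind the new biasOffset array (taken from the NeuralNetwork constructor above): in the new ordering every neuron of layer i+1 owns a contiguous block of n_i + 1 entries, namely its n_i incoming weights followed by its bias, where n_i is the number of neurons in the previous layer. The bias address therefore is

    biasOffset[i][j] = weightOffset[i] + (n_i + 1) * (j + 1) - 1

Worked check for n_i = 2: the blocks read (w, w, b)(w, w, b)..., placing the biases at relative offsets 2, 5, 8, ..., which the formula reproduces for j = 0, 1, 2, ...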
diff --git a/src/libnnp/Settings.cpp b/src/libnnp/Settings.cpp
index 68c0638d8..7f6a2c2ca 100644
--- a/src/libnnp/Settings.cpp
+++ b/src/libnnp/Settings.cpp
@@ -73,6 +73,7 @@ map<string, string> const createKnownKeywordsMap()
     m["jacobian_mode"          ] = "";
     m["update_strategy"        ] = "";
     m["selection_mode"         ] = "";
+    m["decoupling_type"        ] = "";
     m["task_batch_size_energy" ] = "";
     m["task_batch_size_force"  ] = "";
     m["gradient_type"          ] = "";
diff --git a/src/libnnptrain/KalmanFilter.cpp b/src/libnnptrain/KalmanFilter.cpp
index ab55a8419..5d02b44c2 100644
--- a/src/libnnptrain/KalmanFilter.cpp
+++ b/src/libnnptrain/KalmanFilter.cpp
@@ -16,9 +16,11 @@
 #include "KalmanFilter.h"
 #include "utility.h"
+#include "mpi-extra.h"
 
 #include <cmath>
 #include <cstddef>
 #include <stdexcept>
+#include <map> // std::map
 
 using namespace Eigen;
 using namespace std;
@@ -27,8 +29,11 @@ using namespace nnp;
 KalmanFilter::KalmanFilter(size_t const sizeState,
                            KalmanType const type) :
     Updater(sizeState),
+    myRank         (0    ),
+    numProcs       (0    ),
     sizeObservation(0    ),
     numUpdates     (0    ),
+    sizeP          (0    ),
     epsilon        (0.0  ),
     q              (0.0  ),
     q0             (0.0  ),
@@ -62,17 +67,130 @@ KalmanFilter::KalmanFilter(size_t const sizeState,
     w  = new Map<VectorXd>(0, sizeState);
     xi = new Map<VectorXd const>(0, sizeObservation);
     H  = new Map<MatrixXd const>(0, sizeState, sizeObservation);
-    P.resize(sizeState, sizeState);
-    P.setIdentity();
-    // Prevent problems with unallocated K when log starts.
-    K.resize(sizeState, sizeObservation);
-    K.setZero();
 }
 
 KalmanFilter::~KalmanFilter()
 {
 }
 
+void KalmanFilter::setupMPI(MPI_Comm* communicator)
+{
+    MPI_Comm_dup(*communicator, &comm);
+    MPI_Comm_rank(comm, &myRank);
+    MPI_Comm_size(comm, &numProcs);
+
+    return;
+}
+
+void KalmanFilter::setupDecoupling(vector<pair<size_t, size_t>> groupLimits)
+{
+    this->groupLimits = groupLimits;
+
+    // Check consistency of decoupling group limits.
+    if (groupLimits.at(0).first != 0)
+    {
+        auto const& l = groupLimits.at(0);
+        throw runtime_error(strpr("ERROR: Inconsistent decoupling group "
+                                  "limits, first group must start with index "
+                                  "0 (is %zu).\n",
+                                  l.first));
+    }
+    for (size_t i = 0; i < groupLimits.size(); ++i)
+    {
+        auto const& l = groupLimits.at(i);
+        if (l.first > l.second)
+        {
+            throw runtime_error(
+                strpr("ERROR: Inconsistent decoupling group limits, "
+                      "group %zu: start %zu > end %zu.\n",
+                      i, l.first, l.second));
+        }
+    }
+    for (size_t i = 0; i < groupLimits.size() - 1; ++i)
+    {
+        auto const& l = groupLimits.at(i);
+        auto const& r = groupLimits.at(i + 1);
+        if (l.second + 1 != r.first)
+        {
+            throw runtime_error(
+                strpr("ERROR: Inconsistent decoupling group limits, "
+                      "group %zu end %zu + 1 != group %zu start %zu.\n",
+                      i, l.second, i + 1, r.first));
+        }
+    }
+    if (groupLimits.back().second != sizeState - 1)
+    {
+        auto const& l = groupLimits.back();
+        throw runtime_error(strpr("ERROR: Inconsistent decoupling group "
+                                  "limits, last group must end with index "
+                                  "%zu (is %zu).\n",
+                                  sizeState - 1,
+                                  l.second));
+    }
+
+    // Distribute groups to MPI procs.
+    numGroupsPerProc.resize(numProcs, 0);
+    groupOffsetPerProc.resize(numProcs, 0);
+    int quotient  = groupLimits.size() / numProcs;
+    int remainder = groupLimits.size() % numProcs;
+    int groupSum  = 0;
+    for (int i = 0; i < numProcs; ++i)
+    {
+        numGroupsPerProc.at(i) = quotient;
+        if (remainder > 0 && i < remainder)
+        {
+            numGroupsPerProc.at(i)++;
+        }
+        groupOffsetPerProc.at(i) = groupSum;
+        groupSum += numGroupsPerProc.at(i);
+    }
+
+    // Compute state vector segment length and offset for each processor.
+    for (int i = 0; i < numProcs - 1; ++i)
+    {
+        sizeStatePerProc.push_back(
+            groupLimits.at(groupOffsetPerProc.at(i+1)).first -
+            groupLimits.at(groupOffsetPerProc.at(i)).first);
+        offsetStatePerProc.push_back(
+            groupLimits.at(groupOffsetPerProc.at(i)).first);
+    }
+    sizeStatePerProc.push_back(
+        groupLimits.back().second + 1 -
+        groupLimits.at(groupOffsetPerProc.at(numProcs - 1)).first);
+    offsetStatePerProc.push_back(
+        groupLimits.at(groupOffsetPerProc.at(numProcs - 1)).first);
+
+    // Store group index and segment length for this processor.
+    for (int i = 0; i < numGroupsPerProc.at(myRank); ++i)
+    {
+        size_t index = groupOffsetPerProc.at(myRank) + i;
+        size_t size  = groupLimits.at(index).second
+                     - groupLimits.at(index).first + 1;
+        myGroups.push_back(make_pair(index, size));
+    }
+
+    // Allocate per-processor matrices for all groups.
+    for (auto g : myGroups)
+    {
+        P.push_back(Eigen::MatrixXd(g.second, g.second));
+        P.back().setIdentity();
+
+        X.push_back(Eigen::MatrixXd(g.second, sizeObservation));
+
+        // Prevent problems with unallocated K when log starts.
+        K.push_back(Eigen::MatrixXd(g.second, sizeObservation));
+        K.back().setZero();
+
+        sizeP += g.second * g.second;
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &sizeP, 1, MPI_SIZE_T, MPI_SUM, comm);
+
+    return;
+}
+
 void KalmanFilter::setSizeObservation(size_t const size)
 {
     sizeObservation = size;
@@ -125,15 +243,32 @@ void KalmanFilter::update()
 
 void KalmanFilter::update(size_t const sizeObservation)
 {
-    X.resize(sizeState, sizeObservation);
+    MatrixXd A(sizeObservation, sizeObservation);
+    A.setZero();
 
-    // Calculate temporary result.
-    // X = P . H
-    X = P.selfadjointView<Lower>() * (*H);
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        size_t const& j = myGroups.at(i).first;    // group index
+        size_t const& n = myGroups.at(i).second;   // group size
+        size_t const& s = groupLimits.at(j).first; // group weight start index
+
+        auto& x = X.at(i);
+        auto& p = P.at(i);
+
+        x.resize(n, sizeObservation);
+
+        // Calculate temporary result.
+        // X = P . H
+        x = p.selfadjointView<Lower>() * H->block(s, 0, n, sizeObservation);
+
+        // Calculate scaling matrix.
+        // A = H^T . X
+        // For decoupling, sum up the scaling matrix here (locally).
+        A += H->block(s, 0, n, sizeObservation).transpose() * x;
+    }
 
-    // Calculate scaling matrix.
-    // A = H^T . X
-    MatrixXd A = H->transpose() * X;
+    // Sum up scaling matrix globally.
+    MPI_Allreduce(MPI_IN_PLACE, A.data(), A.size(), MPI_DOUBLE, MPI_SUM, comm);
 
     // Increase learning rate.
     // eta(n) = eta(0) * exp(n * tau)
@@ -150,38 +285,52 @@ void KalmanFilter::update(size_t const sizeObservation)
         A.diagonal() += VectorXd::Constant(sizeObservation, lambda);
     }
 
-    // Calculate Kalman gain matrix.
-    // K = X . A^-1
-    K.resize(sizeState, sizeObservation);
-    K = X * A.inverse();
-
-    // Update error covariance matrix.
-    // P = P - K . X^T
-    P.noalias() -= K * X.transpose();
-
-    // Apply forgetting factor.
-    if (type == KT_FADINGMEMORY)
+    for (size_t i = 0; i < myGroups.size(); ++i)
     {
-        P *= 1.0 / lambda;
+        size_t const& j = myGroups.at(i).first;    // group index
+        size_t const& n = myGroups.at(i).second;   // group size
+        size_t const& s = groupLimits.at(j).first; // group weight start index
+
+        auto& x = X.at(i);
+        auto& p = P.at(i);
+        auto& k = K.at(i);
+
+        // Calculate Kalman gain matrix.
+        // K = X . A^-1
+        k.resize(n, sizeObservation);
+        k = x * A.inverse();
+
+        // Update error covariance matrix.
+        // P = P - K . X^T
+        p.noalias() -= k * x.transpose();
+
+        // Apply forgetting factor.
+        if (type == KT_FADINGMEMORY)
+        {
+            p *= 1.0 / lambda;
+        }
+        // Add process noise.
+        // P = P + Q
+        p.diagonal() += VectorXd::Constant(n, q);
+
+        // Update state vector.
+        // w = w + K . xi
+        w->segment(s, n) += k * (*xi);
+
+        // Anneal process noise.
+        // q(n) = q(0) * exp(-n * tau)
+        if (q > qmin) q *= exp(-qtau);
+
+        // Update forgetting factor.
+        if (type == KT_FADINGMEMORY)
+        {
+            lambda = nu * lambda + 1.0 - nu;
+            gamma = 1.0 / (1.0 + lambda / gamma);
+        }
     }
-    // Add process noise.
-    // P = P + Q
-    P.diagonal() += VectorXd::Constant(sizeState, q);
-
-    // Update state vector.
-    // w = w + K . xi
-    (*w) += K * (*xi);
-
-    // Anneal process noise.
-    // q(n) = q(0) * exp(-n * tau)
-    if (q > qmin) q *= exp(-qtau);
-
-    // Update forgetting factor.
-    if (type == KT_FADINGMEMORY)
-    {
-        lambda = nu * lambda + 1.0 - nu;
-        gamma = 1.0 / (1.0 + lambda / gamma);
-    }
+
+    // Communicate new weight segments.
+    MPI_Allgatherv(MPI_IN_PLACE, 0, MPI_DOUBLE, w->data(),
+                   sizeStatePerProc.data(), offsetStatePerProc.data(),
+                   MPI_DOUBLE, comm);
 
     numUpdates++;
@@ -206,7 +355,10 @@ void KalmanFilter::setParametersStandard(double const epsilon,
     q   = q0;
     eta = eta0;
 
-    P /= epsilon;
+    for (auto& p : P)
+    {
+        p /= epsilon;
+    }
 
     return;
 }
@@ -226,7 +378,10 @@ void KalmanFilter::setParametersFadingMemory(double const epsilon,
     this->nu      = nu;
 
     q = q0;
-    P /= epsilon;
+    for (auto& p : P)
+    {
+        p /= epsilon;
+    }
     gamma = 1.0;
 
     return;
@@ -235,12 +390,30 @@
 string KalmanFilter::status(size_t epoch) const
 {
-    double Pasym = 0.5 * (P - P.transpose()).array().abs().mean();
-    double Pdiag = P.diagonal().array().abs().sum();
-    double Poffdiag = (P.array().abs().sum() - Pdiag)
-                    / (sizeState * (sizeState - 1));
+    double Pasym    = 0.0;
+    double Pdiag    = 0.0;
+    double Poffdiag = 0.0;
+    double Kmean    = 0.0;
+
+    for (size_t i = 0; i < myGroups.size(); ++i)
+    {
+        auto const& p = P.at(i);
+        auto const& k = K.at(i);
+        Pasym += 0.5 * (p - p.transpose()).array().abs().sum();
+        double pdiag = p.diagonal().array().abs().sum();
+        Pdiag += pdiag;
+        Poffdiag += p.array().abs().sum() - pdiag;
+        Kmean += k.array().abs().sum();
+    }
+
+    MPI_Allreduce(MPI_IN_PLACE, &Pasym   , 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Pdiag   , 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Poffdiag, 1, MPI_DOUBLE, MPI_SUM, comm);
+    MPI_Allreduce(MPI_IN_PLACE, &Kmean   , 1, MPI_DOUBLE, MPI_SUM, comm);
+
+    Pasym /= (sizeP - sizeState);
     Pdiag /= sizeState;
-    double Kmean = K.array().abs().mean();
+    Poffdiag /= (sizeP - sizeState);
+    Kmean /= (sizeState * sizeObservation);
 
     string s = strpr("%10zu %10zu %16.8E %16.8E %16.8E %16.8E %16.8E",
                      epoch, numUpdates, Pdiag, Poffdiag, Pasym, Kmean, q);
@@ -314,11 +487,24 @@ vector<string> KalmanFilter::info() const
 {
     vector<string> v;
 
+    v.push_back("Extended Kalman Filter (EKF) optimizer:\n");
+    v.push_back(strpr("sizeState       = %zu\n", sizeState));
+    v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
+    v.push_back(strpr("myRank          = %d\n", myRank));
+    v.push_back(strpr("numProcs        = %d\n", numProcs));
+    v.push_back(strpr("OpenMP threads used                     : %d\n",
+                      nbThreads()));
+    v.push_back(strpr("Number of decoupling groups             : %zu\n",
+                      groupLimits.size()));
+    v.push_back(strpr("Number of decoupling groups of proc %3d : %zu\n",
+                      myRank, myGroups.size()));
+    v.push_back(strpr("P matrix size ratio (compared to GEKF)  : %6.2f %%\n",
+                      100.0 * sizeP / (sizeState * sizeState)));
+    v.push_back("-----------------------------------------"
+                "--------------------------------------\n");
     if (type == KT_STANDARD)
     {
         v.push_back(strpr("KalmanType::KT_STANDARD (%d)\n", type));
-        v.push_back(strpr("sizeState       = %zu\n", sizeState));
-        v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
         v.push_back(strpr("epsilon         = %12.4E\n", epsilon));
         v.push_back(strpr("q0              = %12.4E\n", q0     ));
         v.push_back(strpr("qtau            = %12.4E\n", qtau   ));
@@ -330,8 +516,6 @@ vector<string> KalmanFilter::info() const
     else if (type == KT_FADINGMEMORY)
     {
         v.push_back(strpr("KalmanType::KT_FADINGMEMORY (%d)\n", type));
-        v.push_back(strpr("sizeState       = %zu\n", sizeState));
-        v.push_back(strpr("sizeObservation = %zu\n", sizeObservation));
         v.push_back(strpr("epsilon         = %12.4E\n", epsilon));
         v.push_back(strpr("q0              = %12.4E\n", q0     ));
         v.push_back(strpr("qtau            = %12.4E\n", qtau   ));
@@ -339,7 +523,17 @@ vector<string> KalmanFilter::info() const
         v.push_back(strpr("lambda          = %12.4E\n", lambda));
         v.push_back(strpr("nu              = %12.4E\n", nu    ));
     }
-    v.push_back(strpr("OpenMP threads used: %d\n", nbThreads()));
+    //for (size_t i = 0; i < groupLimits.size(); ++i)
+    //{
+    //    v.push_back(strpr(" - group %5zu size: %zu\n",
+    //                      i,
+    //                      groupLimits.at(i).second
+    //                      - groupLimits.at(i).first + 1));
+    //}
+    //for (auto g : myGroups)
+    //{
+    //    v.push_back(strpr(" - group %5zu size: %zu\n", g.first, g.second));
+    //}
 
     return v;
 }
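Written out in matrix form, the group loops above implement, for every decoupling group g with covariance block P_g and the matching row block H_g of the derivative matrix H:

    X_g = P_g H_g
    A   = sum_g H_g^T X_g + Lambda
    K_g = X_g A^(-1)
    w_g <- w_g + K_g xi
    P_g <- P_g - K_g X_g^T + q I

with Lambda = (1/eta) I for KT_STANDARD and Lambda = lambda I for KT_FADINGMEMORY (where P_g is additionally rescaled by 1/lambda). Only the small sizeObservation x sizeObservation matrix A couples the groups; a single MPI_Allreduce on A therefore suffices before each task updates its own blocks, and the MPI_Allgatherv afterwards redistributes the updated state vector segments.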
"--------------------------------------\n"); if (type == KT_STANDARD) { v.push_back(strpr("KalmanType::KT_STANDARD (%d)\n", type)); - v.push_back(strpr("sizeState = %zu\n", sizeState)); - v.push_back(strpr("sizeObservation = %zu\n", sizeObservation)); v.push_back(strpr("epsilon = %12.4E\n", epsilon)); v.push_back(strpr("q0 = %12.4E\n", q0 )); v.push_back(strpr("qtau = %12.4E\n", qtau )); @@ -330,8 +516,6 @@ vector KalmanFilter::info() const else if (type == KT_FADINGMEMORY) { v.push_back(strpr("KalmanType::KT_FADINGMEMORY (%d)\n", type)); - v.push_back(strpr("sizeState = %zu\n", sizeState)); - v.push_back(strpr("sizeObservation = %zu\n", sizeObservation)); v.push_back(strpr("epsilon = %12.4E\n", epsilon)); v.push_back(strpr("q0 = %12.4E\n", q0 )); v.push_back(strpr("qtau = %12.4E\n", qtau )); @@ -339,7 +523,17 @@ vector KalmanFilter::info() const v.push_back(strpr("lambda = %12.4E\n", lambda)); v.push_back(strpr("nu = %12.4E\n", nu )); } - v.push_back(strpr("OpenMP threads used: %d\n", nbThreads())); + //for (size_t i = 0; i < groupLimits.size(); ++i) + //{ + // v.push_back(strpr(" - group %5zu size: %zu\n", + // i, + // groupLimits.at(i).second + // - groupLimits.at(i).first + 1)); + //} + //for (auto g : myGroups) + //{ + // v.push_back(strpr(" - group %5zu size: %zu\n", g.first, g.second)); + //} return v; } diff --git a/src/libnnptrain/KalmanFilter.h b/src/libnnptrain/KalmanFilter.h index 356cee949..0d409ac1c 100644 --- a/src/libnnptrain/KalmanFilter.h +++ b/src/libnnptrain/KalmanFilter.h @@ -22,6 +22,7 @@ #include #include #include +#include #include namespace nnp @@ -50,6 +51,19 @@ class KalmanFilter : public Updater /** Destructor. */ virtual ~KalmanFilter(); + /** Basic MPI setup. + * + * @param[in] communicator Input communicator of calling program. + */ + void setupMPI(MPI_Comm* communicator); + /** Set up Kalman filter decoupling. + * + * @param[in] groupLimits Vector containing starting and ending indices of + * each decoupling group. + */ + void setupDecoupling( + std::vector> groupLimits); /** Set observation vector size. * * @param[in] size Size of the observation vector. @@ -194,10 +208,16 @@ class KalmanFilter : public Updater private: /// Kalman filter type. KalmanType type; + /// My process ID. + int myRank; + /// Total number of MPI processors. + int numProcs; /// Size of observation (measurement) vector. std::size_t sizeObservation; /// Total number of updates performed. std::size_t numUpdates; + /// Number of covariance matrix entries in memory (sum over all groups). + std::size_t sizeP; /// Error covariance initialization parameter @f$\epsilon@f$. double epsilon; /// Process noise @f$q@f$. @@ -222,18 +242,34 @@ class KalmanFilter : public Updater double nu; /// Forgetting gain factor gamma for fading memory Kalman filter. double gamma; + /// Size of state vector segment handled by each processor. + std::vector sizeStatePerProc; + /// Offset of state vector segment handled by each processor. + std::vector offsetStatePerProc; + /// List of (group indices, group sizes) of this processor. + std::vector> myGroups; + /// Number of groups for each processor. + std::vector numGroupsPerProc; + /// Offset in terms of groups for each processor. + std::vector groupOffsetPerProc; + /// Decoupling group limits, i.e. starting end ending indices. + std::vector> groupLimits; /// State vector. Eigen::Map* w; /// Error vector. Eigen::Map* xi; /// Derivative matrix. Eigen::Map* H; - /// Error covariance matrix. - Eigen::MatrixXd P; + /// Error covariance matrices for all groups. 
diff --git a/src/libnnptrain/Training.cpp b/src/libnnptrain/Training.cpp
index ba293c7d0..94de5b470 100644
--- a/src/libnnptrain/Training.cpp
+++ b/src/libnnptrain/Training.cpp
@@ -40,6 +40,7 @@ Training::Training() : Dataset(),
                        jacobianMode  (JM_SUM      ),
                        updateStrategy(US_COMBINED ),
                        selectionMode (SM_RANDOM   ),
+                       decouplingType(DT_GLOBAL   ),
                        hasUpdaters   (false       ),
                        hasStructures (false       ),
                        useForces     (false       ),
@@ -300,7 +301,13 @@ void Training::initializeWeights()
                 w.at(i).at(j) = minWeights + gsl_rng_uniform(rngGlobal)
                               * (maxWeights - minWeights);
             }
-            elements.at(i).neuralNetwork->setConnections(&(w.at(i).front()));
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+            // To be consistent this should actually be the non-AO version,
+            // but this way results stay comparable.
+            elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
+#else
+            elements.at(i).neuralNetwork->setConnectionsAO(&(w.at(i).front()));
+#endif
         }
         if (settings.keywordExists("nguyen_widrow_weights_short"))
         {
@@ -991,6 +998,44 @@ void Training::setupTraining()
     {
         kalmanType = (KalmanFilter::KalmanType)
                      atoi(settings["kalman_type"].c_str());
+        decouplingType = (DecouplingType)
+                         atoi(settings["decoupling_type"].c_str());
+        if (decouplingType == DT_GLOBAL)
+        {
+            log << strpr("No decoupling (GEKF) selected: DT_GLOBAL (%d).\n",
+                         decouplingType);
+        }
+        else if (decouplingType == DT_ELEMENT)
+        {
+            log << strpr("Per-element decoupling (ED-GEKF) selected: "
+                         "DT_ELEMENT (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_LAYER)
+        {
+            log << strpr("Per-layer decoupling selected: "
+                         "DT_LAYER (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_NODE)
+        {
+            log << strpr("Per-node decoupling (NDEKF) selected: "
+                         "DT_NODE (%d).\n", decouplingType);
+        }
+        else if (decouplingType == DT_FULL)
+        {
+            log << strpr("Full (per-weight) decoupling selected: "
+                         "DT_FULL (%d).\n", decouplingType);
+        }
+        else
+        {
+            throw runtime_error("ERROR: Unknown Kalman filter decoupling "
+                                "type.\n");
+        }
+        if (decouplingType != DT_GLOBAL && updateStrategy != US_COMBINED)
+        {
+            throw runtime_error(strpr("ERROR: Kalman filter decoupling works "
+                                      "only in conjunction with "
+                                      "update_strategy %d.\n", US_COMBINED));
+        }
     }
 
     for (size_t i = 0; i < numUpdaters; ++i)
@@ -1008,6 +1053,81 @@ void Training::setupTraining()
             updaters.push_back(
                 (Updater*)new KalmanFilter(numWeightsPerUpdater.at(i),
                                            kalmanType));
+            KalmanFilter* u = dynamic_cast<KalmanFilter*>(updaters.back());
+            if (parallelMode == PM_TRAIN_ALL)
+            {
+                u->setupMPI(&comm);
+            }
+            else
+            {
+                MPI_Group groupWorld;
+                MPI_Comm_group(comm, &groupWorld);
+                int const size0 = 1;
+                int const rank0[1] = {0};
+                MPI_Group groupRank0;
+                MPI_Group_incl(groupWorld, size0, rank0, &groupRank0);
+                MPI_Comm commRank0;
+                MPI_Comm_create_group(comm, groupRank0, 0, &commRank0);
+                u->setupMPI(&commRank0);
+            }
+            vector<pair<size_t, size_t>> limits;
+            if (decouplingType == DT_GLOBAL)
+            {
+                limits.push_back(
+                    make_pair(0, numWeightsPerUpdater.at(i) - 1));
+            }
+            else if (decouplingType == DT_ELEMENT)
+            {
+                for (size_t j = 0; j < numElements; ++j)
+                {
+                    limits.push_back(make_pair(
+                        weightsOffset.at(j),
+                        weightsOffset.at(j) + elements.at(j).
+                        neuralNetwork->getNumConnections() - 1));
+                }
+            }
+            else if (decouplingType == DT_LAYER)
+            {
+                for (size_t j = 0; j < numElements; ++j)
+                {
+                    for (auto l : elements.at(j).neuralNetwork
+                                  ->getLayerLimits())
+                    {
+                        limits.push_back(make_pair(
+                            weightsOffset.at(j) + l.first,
+                            weightsOffset.at(j) + l.second));
+                    }
+                }
+            }
+            else if (decouplingType == DT_NODE)
+            {
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
+                for (size_t j = 0; j < numElements; ++j)
+                {
+                    for (auto l : elements.at(j).neuralNetwork
+                                  ->getNeuronLimits())
+                    {
+                        limits.push_back(make_pair(
+                            weightsOffset.at(j) + l.first,
+                            weightsOffset.at(j) + l.second));
+                    }
+                }
+#else
+                throw runtime_error("ERROR: Node-decoupled Kalman filter "
+                                    "(NDEKF) training not possible with "
+                                    "alternative (old) weight memory "
+                                    "layout, recompile without "
+                                    "-DALTERNATIVE_WEIGHT_ORDERING.\n");
+#endif
+            }
+            else if (decouplingType == DT_FULL)
+            {
+                for (size_t j = 0; j < numWeightsPerUpdater.at(i); ++j)
+                {
+                    limits.push_back(make_pair(j, j));
+                }
+            }
+            u->setupDecoupling(limits);
         }
         updaters.back()->setState(&(weights.at(i).front()));
     }
@@ -1432,7 +1552,7 @@ void Training::calculateErrorEpoch()
         fileNameForcesTest = strpr("testforces.%06zu.out", epoch);
     }
 
-    // Calculate RMSE and write comparison files.
+    // Calculate error and write comparison files.
     calculateError(true,
                    identifier,
                    fileNameEnergiesTrain,
@@ -1452,7 +1572,9 @@ void Training::writeWeights(string const fileNameFormat) const
         string fileName = strpr(fileNameFormat.c_str(),
                                 elements.at(i).getAtomicNumber());
        file.open(fileName.c_str());
-        elements.at(i).neuralNetwork->writeConnections(file);
+        // Attention: need alternative (old) ordering scheme here for
+        // backward compatibility!
+        elements.at(i).neuralNetwork->writeConnectionsAO(file);
         file.close();
     }
@@ -1629,7 +1751,7 @@ void Training::writeNeuronStatistics(string const fileName) const
     vector<string> colInfo;
     vector<size_t> colSize;
     title.push_back("Statistics for individual neurons gathered during "
-                    "RMSE calculation.");
+                    "error calculation.");
     colSize.push_back(10);
     colName.push_back("element");
     colInfo.push_back("Element index.");
@@ -1743,24 +1865,31 @@ void Training::writeUpdaterStatus(bool append,
     for (size_t i = 0; i < numUpdaters; ++i)
     {
-        string fileName;
-        if (updateStrategy == US_COMBINED)
-        {
-            fileName = strpr(fileNameFormat.c_str(), 0);
-        }
-        else if (updateStrategy == US_ELEMENT)
+        if (myRank == 0)
         {
-            fileName = strpr(fileNameFormat.c_str(),
-                             elementMap.atomicNumber(i));
+            string fileName;
+            if (updateStrategy == US_COMBINED)
+            {
+                fileName = strpr(fileNameFormat.c_str(), 0);
+            }
+            else if (updateStrategy == US_ELEMENT)
+            {
+                fileName = strpr(fileNameFormat.c_str(),
+                                 elementMap.atomicNumber(i));
+            }
+            if (append) file.open(fileName.c_str(), ofstream::app);
+            else
+            {
+                file.open(fileName.c_str());
+                appendLinesToFile(file, updaters.at(i)->statusHeader());
+            }
+            file << updaters.at(i)->status(epoch);
+            file.close();
        }
-        if (append) file.open(fileName.c_str(), ofstream::app);
-        else
+        else if (parallelMode == PM_TRAIN_ALL)
         {
-            file.open(fileName.c_str());
-            appendLinesToFile(file, updaters.at(i)->statusHeader());
+            updaters.at(i)->status(epoch);
         }
-        file << updaters.at(i)->status(epoch);
-        file.close();
     }
 
     return;
 }
@@ -1884,7 +2013,7 @@ void Training::loop()
     else if (errorMetricForces == 1) metric = "MAE";
     if (errorMetricEnergies % 2 == 0) peratom = "per atom ";
 
-    log << "The training loop output covers different RMSEs, update and\n";
+    log << "The training loop output covers different errors, update and\n";
     log << "timing information. The following quantities are organized\n";
     log << "according to the matrix scheme below:\n";
     log << "-------------------------------------------------------------------\n";
@@ -1911,7 +2040,7 @@ void Training::loop()
     log << "Abbreviations:\n";
     log << "    p. u. = physical units.\n";
     log << "    i. u. = internal units.\n";
-    log << "Note: RMSEs in internal units (columns 5 + 6) are only present \n";
+    log << "Note: Errors in internal units (columns 5 + 6) are only present \n";
     log << "      if data set normalization is used.\n";
     log << "-------------------------------------------------------------------\n";
     log << "       1     2         3         4         5         6\n";
@@ -1940,8 +2069,9 @@ void Training::loop()
     // Write learning curve.
     if (myRank == 0) writeLearningCurve(false);
 
-    // Write updater status to file.
-    if (myRank == 0) writeUpdaterStatus(false);
+    // Write updater status to file (decoupled KF requires all tasks to
+    // execute the status() function).
+    writeUpdaterStatus(false);
 
     // Write neuron statistics.
     writeNeuronStatisticsEpoch();
@@ -2013,7 +2143,7 @@ void Training::loop()
         if (myRank == 0) writeLearningCurve(true);
 
         // Write updater status to file.
-        if (myRank == 0) writeUpdaterStatus(true);
+        writeUpdaterStatus(true);
 
         // Write neuron statistics.
         writeNeuronStatisticsEpoch();
@@ -2280,6 +2410,22 @@ void Training::update(bool force)
                 else
                 {
                     nn->calculateDEdc(&(dXdc.at(i).front()));
+                    //vector<double> temp(dXdc.at(i).size(), 0.0);
+                    //vector<double> tempAO(dXdc.at(i).size(), 0.0);
+                    //nn->calculateDEdc(&(temp.front()));
+                    //ofstream tf("dedc.out");
+                    //size_t count;
+                    //count = 0;
+                    //sort(temp.begin(), temp.end());
+                    //for (auto i : temp) tf << strpr("%zu %16.8E\n", count++, i);
+                    //tf.close();
+                    //nn->calculateDEdcAO(&(tempAO.front()));
+                    //tf.open("dedc.out.ao");
+                    //count = 0;
+                    //sort(tempAO.begin(), tempAO.end());
+                    //for (auto i : tempAO) tf << strpr("%zu %16.8E\n", count++, i);
+                    //tf.close();
+                    //throw runtime_error("END HERE");
                 }
                 // Finally sum up Jacobian.
                 if (updateStrategy == US_ELEMENT) iu = i;
@@ -2715,7 +2861,11 @@ void Training::getWeights()
         for (size_t i = 0; i < numElements; ++i)
        {
             NeuralNetwork const* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->getConnections(&(weights.at(0).at(pos)));
+#else
+            nn->getConnectionsAO(&(weights.at(0).at(pos)));
+#endif
             pos += nn->getNumConnections();
         }
     }
@@ -2724,7 +2874,11 @@ void Training::getWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork const* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->getConnections(&(weights.at(i).front()));
+#else
+            nn->getConnectionsAO(&(weights.at(i).front()));
+#endif
         }
     }
@@ -2739,7 +2893,11 @@ void Training::setWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->setConnections(&(weights.at(0).at(pos)));
+#else
+            nn->setConnectionsAO(&(weights.at(0).at(pos)));
+#endif
             pos += nn->getNumConnections();
         }
     }
@@ -2748,7 +2906,11 @@ void Training::setWeights()
         for (size_t i = 0; i < numElements; ++i)
         {
             NeuralNetwork* const& nn = elements.at(i).neuralNetwork;
+#ifndef ALTERNATIVE_WEIGHT_ORDERING
             nn->setConnections(&(weights.at(i).front()));
+#else
+            nn->setConnectionsAO(&(weights.at(i).front()));
+#endif
         }
     }
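For illustration, the DT_ELEMENT branch above builds one decoupling group per element network; the following standalone sketch (hypothetical connection counts) reproduces that limit construction:

    #include <cstdio>
    #include <utility>
    #include <vector>

    int main()
    {
        // Hypothetical sizes: two element networks with 100 and 120
        // connections concatenated into one combined state vector.
        std::vector<std::size_t> numConnections = {100, 120};
        std::vector<std::pair<std::size_t, std::size_t>> limits;

        std::size_t offset = 0;
        for (std::size_t n : numConnections)
        {
            limits.push_back(std::make_pair(offset, offset + n - 1));
            offset += n;
        }
        // Prints "(0,99) (100,219)": each element NN becomes one group,
        // i.e. one diagonal block of the covariance matrix P.
        for (auto const& l : limits)
        {
            std::printf("(%zu,%zu) ", l.first, l.second);
        }
        std::printf("\n");
        return 0;
    }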
diff --git a/src/libnnptrain/Training.h b/src/libnnptrain/Training.h
index 1ae678255..93022ebc8 100644
--- a/src/libnnptrain/Training.h
+++ b/src/libnnptrain/Training.h
@@ -110,6 +110,21 @@ class Training : public Dataset
         SM_THRESHOLD
     };
 
+    /// Kalman filter decoupling type (requires UT_KF and US_COMBINED).
+    enum DecouplingType
+    {
+        /// No decoupling, global extended Kalman filter (GEKF).
+        DT_GLOBAL,
+        /// Per-element decoupling, ED-GEKF (doi.org/10.1021/acs.jctc.5b00211).
+        DT_ELEMENT,
+        /// Layer decoupling.
+        DT_LAYER,
+        /// Node decoupling, NDEKF.
+        DT_NODE,
+        /// Full decoupling, FDEKF.
+        DT_FULL
+    };
+
     /// Constructor.
     Training();
     /// Destructor, updater vector needs to be cleaned.
@@ -309,6 +324,8 @@ class Training : public Dataset
     UpdateStrategy  updateStrategy;
     /// Selection mode for update candidates.
     SelectionMode   selectionMode;
+    /// Kalman filter decoupling type if KF training selected.
+    DecouplingType  decouplingType;
     /// If this rank performs weight updates.
     bool            hasUpdaters;
     /// If this rank holds structure information.
diff --git a/src/makefile.gnu b/src/makefile.gnu
index 223a03485..ece0147ce 100644
--- a/src/makefile.gnu
+++ b/src/makefile.gnu
@@ -56,3 +56,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING
diff --git a/src/makefile.intel b/src/makefile.intel
index 17e61b7a0..4a9a0e94c 100644
--- a/src/makefile.intel
+++ b/src/makefile.intel
@@ -57,3 +57,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING
diff --git a/src/makefile.llvm b/src/makefile.llvm
index b748ce08b..30deb9a03 100644
--- a/src/makefile.llvm
+++ b/src/makefile.llvm
@@ -58,3 +58,6 @@ PROJECT_OPTIONS+= -DEIGEN_DONT_PARALLELIZE
 
 # Use improved memory layout for symmetry function derivatives.
 PROJECT_OPTIONS+= -DIMPROVED_SFD_MEMORY
+
+# Use alternative (old) ordering scheme of weights (NDEKF not possible).
+#PROJECT_OPTIONS+= -DALTERNATIVE_WEIGHT_ORDERING
diff --git a/test/cpp/Example_nnp_train.h b/test/cpp/Example_nnp_train.h
index fe4aa3d22..88ed24a76 100644
--- a/test/cpp/Example_nnp_train.h
+++ b/test/cpp/Example_nnp_train.h
@@ -28,13 +28,13 @@ void BoostDataContainer<Example_nnp_train>::setup()
     e->rmseForcesTrain = 3.77302629E-02;
     e->rmseForcesTest  = 2.25135617E-01;
 
-    //examples.push_back(Example_nnp_train("H2O_RPBE-D3"));
-    //e = &(examples.back());
-    //e->lastEpoch = 10;
-    //e->rmseEnergyTrain = 1.02312914E-04;
-    //e->rmseEnergyTest  = 5.28837597E-04;
-    //e->rmseForcesTrain = 1.67069652E-02;
-    //e->rmseForcesTest  = 1.07708383E-02;
+    examples.push_back(Example_nnp_train("H2O_RPBE-D3"));
+    e = &(examples.back());
+    e->lastEpoch = 5;
+    e->rmseEnergyTrain = 9.19395999E-04;
+    e->rmseEnergyTest  = 7.42520683E-04;
+    e->rmseForcesTrain = 2.40682415E-02;
+    e->rmseForcesTest  = 1.44776060E-02;
 
     return;
 }