Changes from all commits (235 commits)
3e9b9f1
Implementation of linear_ layer for neural networks. This layer provi…
Cydral Apr 28, 2025
93ead3d
Minor change
Cydral May 2, 2025
bf1b805
Update dlib/dnn/layers.h
davisking May 3, 2025
49bfbc6
Merge branch 'davisking:master' into master
Cydral May 6, 2025
f234faa
Add reshape_to and flatten layers to Dlib's DNN module
Cydral May 6, 2025
26a2960
Missing update to "visitors.h"
Cydral May 22, 2025
c9a1ee4
format fixing for reshape_to
Cydral May 22, 2025
02e62d8
Update dlib/test/dnn.cpp
davisking May 23, 2025
394dee8
Merge branch 'davisking:master' into master
Cydral May 29, 2025
778bfc1
Vocabulary size fixed for learning, and function added for transforma…
Cydral May 29, 2025
03aafc2
Added a new example for learning a “complex” Transformer model.
Cydral May 29, 2025
22c2561
Added a new example for learning a “complex” Transformer model.
Cydral May 29, 2025
01cd0b2
Updated example for training a Transformer model.
Cydral May 29, 2025
6b63e55
fix for gcc/ffmpeg compilation
Cydral May 30, 2025
ad1f757
Fix a warning message for Ubuntu compilation.
Cydral May 30, 2025
c91c45a
Update for Linux environment.
Cydral May 30, 2025
6fcc0aa
Fix batch building
Cydral May 31, 2025
5a1773e
Slight improvement in model definition.
Cydral Jun 3, 2025
10d7c59
linear_ layer implementation improvement
Cydral Jun 7, 2025
d4bf94b
finalizing the example
Cydral Jun 7, 2025
a4dac0b
Fixing break condition in training method.
Cydral Jun 8, 2025
63454e3
Fixing declaration order of variables.
Cydral Jun 8, 2025
87ed70a
bpe_tokenizer improvements.
Cydral Jun 8, 2025
061c673
Example updated.
Cydral Jun 16, 2025
f6c8526
bpe_tokenizer class refactoring.
Cydral Jun 16, 2025
2db56f5
Example updated.
Cydral Jun 16, 2025
d4eeb2d
bpe_tokenizer class updated.
Cydral Jun 16, 2025
dcb5963
Decoding part of the bpe_tokenizer updated.
Cydral Jun 17, 2025
b81b502
Network definition update
Cydral Jun 27, 2025
80a6e0e
Merge branch 'davisking:master' into master
Cydral Aug 25, 2025
d520c2a
Add Adaptive Computation Time (ACT) layer with CPU/CUDA support
Cydral Aug 27, 2025
b089d58
Fixes
Cydral Aug 29, 2025
1a904f2
Update comments for params
Sep 8, 2025
ab29fc4
Fixes and improvements
Cydral Sep 13, 2025
f16f743
Disabling enable_depth_scaling, which obviously affects the result of…
Cydral Sep 13, 2025
4d95752
Merge branch 'davisking:master' into master
Cydral Sep 29, 2025
9b3e7dc
Implementation of ARC-AGI dataset loader
Oct 6, 2025
f2ac8b9
Merge branch 'davisking:master' into master
Cydral Oct 15, 2025
97f9368
Transformer structures and models integration
Cydral Oct 24, 2025
18b7dfc
Transformer integration (update)
Oct 27, 2025
0368056
Transformer integration (update)
Oct 27, 2025
0c0fcfa
Transformer integration (update)
Oct 27, 2025
7311279
Transformer integration (update)
Oct 29, 2025
cd1d054
Transformer integration (update)
Oct 29, 2025
ba7a9f6
Transformer integration (update)
Oct 29, 2025
4c67a0b
Transformer integration (update)
Oct 29, 2025
a9fbbb5
Transformer integration (update)
Oct 29, 2025
9c9df71
Update
Oct 29, 2025
892f0d9
Update
Oct 30, 2025
a7e8a6e
Update
Oct 30, 2025
56d1e45
Update
Oct 30, 2025
eb21183
Update
Oct 30, 2025
5a80efc
Update
Oct 31, 2025
c1cad3c
Update
Oct 31, 2025
8a5aabf
Update
Oct 31, 2025
3369358
Update
Oct 31, 2025
2e05a7b
Update
Cydral Nov 1, 2025
c3b7b8d
Update
Cydral Nov 2, 2025
3722ff3
Update
Cydral Nov 2, 2025
3fe82c1
Update
Cydral Nov 2, 2025
5256d70
Update
Cydral Nov 2, 2025
7ab4224
Update
Cydral Nov 2, 2025
e41b061
Update
Cydral Nov 2, 2025
a80d267
Update
Cydral Nov 2, 2025
14a113b
Update
Cydral Nov 2, 2025
c0c23e8
Update
Cydral Nov 2, 2025
8b4db5f
Update
Cydral Nov 2, 2025
08323b5
Update
Cydral Nov 2, 2025
3b88e35
Update
Cydral Nov 2, 2025
2de5697
Update
Cydral Nov 2, 2025
a7a30ce
Update
Cydral Nov 2, 2025
66a0581
Update
Nov 3, 2025
c96097a
Update
Nov 3, 2025
9b2375f
Update
Nov 3, 2025
13c5313
Update
Cydral Nov 3, 2025
d724242
Update
Nov 4, 2025
e8f1cec
Update
Nov 4, 2025
9c09117
Update
Nov 4, 2025
f745721
Update
Nov 4, 2025
2901dde
Update
Nov 4, 2025
13e9ca8
Update
Nov 4, 2025
6b11c3b
Update
Nov 4, 2025
20b21f8
Update
Cydral Nov 3, 2025
eccfcd7
Update
Cydral Nov 5, 2025
dfaf59f
Update
Cydral Nov 5, 2025
40da84e
Update
Nov 6, 2025
90a3d69
Update
Nov 6, 2025
10a57cf
Update
Nov 6, 2025
cb97eec
Update
Cydral Nov 6, 2025
303d501
Update
Cydral Nov 6, 2025
e9fe817
Update
Cydral Nov 6, 2025
18be2cb
Update
Cydral Nov 6, 2025
e4e283d
Update
Cydral Nov 7, 2025
c7e5e1c
Update
Cydral Nov 7, 2025
33bf45c
Update
Cydral Nov 8, 2025
0fdcbdf
Update
Cydral Nov 8, 2025
4062cd6
Update
Cydral Nov 8, 2025
99b558e
Update
Cydral Nov 8, 2025
7ad9c2f
Update
Cydral Nov 8, 2025
3c1fb72
Update
Cydral Nov 8, 2025
f38d9b4
Update
Cydral Nov 8, 2025
9ce3e1e
Update
Cydral Nov 9, 2025
4d29b02
Update
Cydral Nov 9, 2025
0c8a411
Update
Nov 10, 2025
57cc916
Update
Nov 10, 2025
4c20ab9
Update
Cydral Nov 9, 2025
806c204
Update
Cydral Nov 10, 2025
9adea62
Update
Cydral Nov 10, 2025
b5f430c
Update
Cydral Nov 10, 2025
a1e0696
Update
Cydral Nov 10, 2025
f3732f7
Update
Cydral Nov 11, 2025
48edb1b
Update
Cydral Nov 11, 2025
a2bd297
Update
Cydral Nov 11, 2025
b617d15
Update
Cydral Nov 11, 2025
ad46857
Update
Cydral Nov 11, 2025
91897e8
Update
Cydral Nov 11, 2025
28547fb
Update
Cydral Nov 11, 2025
6c73ffc
Update
Cydral Nov 11, 2025
6f8167b
Update
Nov 12, 2025
22ea770
Update
Nov 12, 2025
5140fd1
Update
Nov 12, 2025
491f380
Update
Nov 12, 2025
1132007
Update
Nov 12, 2025
a66ec7b
Update
Nov 12, 2025
2ad0dc2
Update
Cydral Nov 12, 2025
6ea54e6
Update
Cydral Nov 12, 2025
1cdba1a
Update
Nov 13, 2025
d858103
Update
Nov 13, 2025
4735cb7
Update
Nov 13, 2025
22f28d1
Update
Nov 13, 2025
966c8dc
Update
Nov 13, 2025
1232fd7
Update
Nov 13, 2025
871a7fd
Update
Nov 13, 2025
33f5162
Update
Nov 14, 2025
4961282
Update
Cydral Nov 14, 2025
a6178f0
Update
Cydral Nov 15, 2025
e885ab8
Update
Cydral Nov 15, 2025
7dc8ce0
Update
Cydral Nov 15, 2025
65e24b2
Update
Cydral Nov 15, 2025
58436fc
Update
Cydral Nov 15, 2025
7e095db
Update
Cydral Nov 15, 2025
27fa8a2
Update
Cydral Nov 15, 2025
d6f46f0
Update
Cydral Nov 15, 2025
d7ae7de
Update
Cydral Nov 15, 2025
d13d8d8
Update
Cydral Nov 15, 2025
e29b2f1
Update
Cydral Nov 15, 2025
39c5f9b
Update
Cydral Nov 15, 2025
5db8473
Update
Cydral Nov 15, 2025
76370e7
Update
Cydral Nov 16, 2025
c837549
Update
Cydral Nov 17, 2025
38d803e
Update
Cydral Nov 17, 2025
1479e74
Update
Cydral Nov 17, 2025
b1a4112
Update
Cydral Nov 20, 2025
179db86
Update
Cydral Nov 20, 2025
9b416ae
Update
Cydral Nov 20, 2025
13c2b84
Update
Cydral Nov 20, 2025
b2e64cf
Update
Cydral Nov 20, 2025
26d0648
Update
Cydral Nov 20, 2025
01dcece
Update
Cydral Nov 20, 2025
f1a7085
Update
Cydral Nov 20, 2025
eae5a42
Update
Cydral Nov 20, 2025
40eab2b
Update
Cydral Nov 20, 2025
e81b369
Update
Cydral Nov 20, 2025
f08d678
Update
Cydral Nov 20, 2025
cc88afc
Update
Cydral Nov 20, 2025
2fe42df
Update
Cydral Nov 20, 2025
0c2d183
Update
Cydral Nov 21, 2025
272f41c
Update
Cydral Nov 21, 2025
b3dd372
Update
Cydral Nov 23, 2025
3267a01
Update
Cydral Nov 23, 2025
5bc8428
Update
Cydral Nov 23, 2025
b0f5aa3
Update
Cydral Nov 23, 2025
76a043e
Update
Cydral Nov 24, 2025
3ea781f
Update
Cydral Nov 25, 2025
6d90742
Update
Cydral Nov 25, 2025
d3e143e
Update
Cydral Nov 25, 2025
726af51
Update
Cydral Nov 25, 2025
8b3219c
Update
Cydral Nov 25, 2025
c105ccf
Update
Cydral Nov 25, 2025
caac93b
Update
Cydral Nov 25, 2025
504d551
Update
Cydral Nov 26, 2025
ee33e9f
Update
Cydral Nov 26, 2025
3e83339
Update
Cydral Nov 26, 2025
62cf36b
Update
Cydral Nov 27, 2025
e36645a
Update
Cydral Nov 27, 2025
04ab704
Update
Cydral Nov 27, 2025
3f19899
Update
Cydral Nov 28, 2025
70ec30a
Update
Cydral Nov 28, 2025
d48164a
Update
Cydral Nov 28, 2025
40dd868
Update
Cydral Dec 1, 2025
44830f7
Update
Cydral Dec 2, 2025
85eb2c9
Update
Cydral Dec 5, 2025
5a83f2c
Update
Cydral Dec 5, 2025
78aae5b
Update
Cydral Dec 5, 2025
7cce339
Update
Cydral Dec 6, 2025
e8c6950
Update
Cydral Dec 7, 2025
e496c7d
Update
Cydral Dec 7, 2025
6460e81
Update
Cydral Dec 7, 2025
c6f6979
Update
Cydral Dec 7, 2025
952513c
Update
Cydral Dec 7, 2025
5a00bd8
Update
Cydral Dec 7, 2025
b302fb8
Update
Cydral Dec 8, 2025
b9be752
Update
Cydral Dec 9, 2025
8dfbdef
Update
Cydral Dec 13, 2025
17f859b
Update
Cydral Dec 15, 2025
f66369b
Update
Cydral Dec 16, 2025
84bd433
Update
Cydral Dec 16, 2025
c82213d
Update
Cydral Dec 17, 2025
8d1f4ea
Update
Cydral Dec 17, 2025
def6359
Update
Cydral Dec 18, 2025
9e76ed5
Update
Cydral Dec 18, 2025
426857c
Update
Cydral Dec 18, 2025
3fe4ee1
Update
Cydral Dec 19, 2025
c4086bc
New example
Cydral Dec 19, 2025
1fc065d
New example added
Cydral Dec 19, 2025
7b2c4ef
Update
Cydral Dec 19, 2025
9c86229
Update
Cydral Dec 20, 2025
0c730f9
Update
Cydral Dec 20, 2025
f028608
Fix bug in cuda code for act layer
Cydral Dec 23, 2025
e2c229d
Embeddings class improvement
Cydral Dec 24, 2025
d74b2f9
Update
Cydral Dec 26, 2025
9396527
Update
Cydral Dec 26, 2025
114dab9
Update
Cydral Dec 28, 2025
306b1d4
Update
Cydral Dec 28, 2025
da591d3
Fix tril_padding_context multiple definition linker errors with Meyer…
Cydral Dec 29, 2025
b69d284
Add lr_mult_visitor for visit_layers_range
Cydral Dec 29, 2025
1a6494f
Removed used var in patch_embeddings_/backward
Cydral Dec 29, 2025
1ca58b1
Updated slm_mixture_of_experts_ex.cpp example
Cydral Dec 29, 2025
d7a4ebe
Updated slm_chatbot_ex.cpp example
Cydral Dec 29, 2025
e61623f
Update
Cydral Dec 29, 2025
e6bb9ed
Remove (old) french comment
Cydral Jan 2, 2026
b40ed81
Fix typo
Cydral Jan 2, 2026
0d15d7b
New static signal handler using
Cydral Jan 2, 2026
86dcfcf
Add "atomic" header
Cydral Jan 2, 2026
07a847b
Remove HRM example to stabilize first the new version of the adaptive…
Cydral Jan 3, 2026
84 changes: 77 additions & 7 deletions dlib/cuda/cpu_dlib.cpp
@@ -1494,7 +1494,6 @@ namespace dlib
}
p_scale[n] = 1.0f / std::sqrt(p_scale[n] / (ks * num) + static_cast<float>(eps));
}
-scale.host();
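(Editor's note: the removed scale.host() call appears to have been redundant; p_scale above already references the scale tensor's host buffer, so the extra call did no useful work.)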

// Apply RMS normalization
p_src = src.host();
@@ -1648,14 +1647,22 @@ namespace dlib
for (long k = 0; k < num_channels; ++k)
max_val = std::max(max_val, ss[k * num_locations]);

-float sum = 0.0f;
-for (long k = 0; k < num_channels; ++k)
-{
-    dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
-    sum += dd[k * num_locations];
-}
-for (long k = 0; k < num_channels; ++k)
-    dd[k * num_locations] /= sum;
+if (max_val == -std::numeric_limits<float>::infinity())
+{
+    for (long k = 0; k < num_channels; ++k)
+        dd[k * num_locations] = 0.0f;
+}
+else
+{
+    float sum = 0.0f;
+    for (long k = 0; k < num_channels; ++k)
+    {
+        dd[k * num_locations] = std::exp(ss[k * num_locations] - max_val);
+        sum += dd[k * num_locations];
+    }
+    for (long k = 0; k < num_channels; ++k)
+        dd[k * num_locations] /= sum;
+}

++ss;
++dd;
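Editor's note on the change above: when an entire channel column is masked to -inf (as causal attention masks do for padded or future positions), max_val is itself -inf and the old code evaluated std::exp(-inf - (-inf)), which is NaN; the NaN then poisons the sum and every output. The new branch instead defines the softmax of a fully masked column as all zeros. A minimal standalone repro of the failure mode, assuming nothing from dlib:

#include <cmath>
#include <cstdio>
#include <limits>

int main()
{
    const float ninf = -std::numeric_limits<float>::infinity();
    const float logit   = ninf;  // every entry of a fully masked column
    const float max_val = ninf;  // so the column maximum is also -inf
    const float e = std::exp(logit - max_val);  // -inf - (-inf) == NaN
    std::printf("%f\n", e);      // prints nan; sum and quotient follow suit
    return 0;
}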
@@ -3366,6 +3373,69 @@ namespace dlib
}
}

// ------------------------------------------------------------------------------------

void apply_rotary_positional_embedding(
bool is_backward,
resizable_tensor& data,
const resizable_tensor& cos_cache,
const resizable_tensor& sin_cache)
{
const long batch_size = data.num_samples();
const long num_heads = data.k();
const long seq_len = data.nr();
const long d_head = data.nc();
const long half_d = d_head / 2;

DLIB_CASSERT(cos_cache.nr() == seq_len, "cos_cache rows must match seq_len");
DLIB_CASSERT(cos_cache.nc() == half_d, "cos_cache cols must be d_head/2");
DLIB_CASSERT(sin_cache.nr() == seq_len, "sin_cache rows must match seq_len");
DLIB_CASSERT(sin_cache.nc() == half_d, "sin_cache cols must be d_head/2");

const bool is_odd = (d_head % 2 != 0);
const long rot_dim = is_odd ? d_head - 1 : d_head;

float* data_ptr = data.host();
const float* cos_ptr = cos_cache.host();
const float* sin_ptr = sin_cache.host();

const size_t total_elements = batch_size * num_heads * seq_len * half_d;

parallel_for(0, total_elements, [&](long idx)
{
const long pair_idx = idx % half_d;
const long pos = (idx / half_d) % seq_len;
const long head = (idx / (half_d * seq_len)) % num_heads;
const long batch = idx / (half_d * seq_len * num_heads);

const long dim_i = pair_idx * 2;
if (dim_i >= rot_dim) return;

const long data_offset = ((batch * num_heads + head) * seq_len + pos) * d_head + dim_i;
const long trig_offset = pos * half_d + pair_idx;

const float c = cos_ptr[trig_offset];
const float s = sin_ptr[trig_offset];
const float x0 = data_ptr[data_offset];
const float x1 = data_ptr[data_offset + 1];

if (!is_backward)
{
// Forward: [cos -sin] [x0]
// [sin cos] [x1]
data_ptr[data_offset] = x0 * c - x1 * s;
data_ptr[data_offset + 1] = x0 * s + x1 * c;
}
else
{
// Backward (inverse rotation): [cos sin] [x0]
// [-sin cos] [x1]
data_ptr[data_offset] = x0 * c + x1 * s;
data_ptr[data_offset + 1] = -x0 * s + x1 * c;
}
});
}

// ------------------------------------------------------------------------------------

}
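Editor's note: a sketch of how the cos/sin caches this routine expects could be built. The (seq_len x d_head/2) shapes follow the DLIB_CASSERTs above, but the 10000^(-2i/d_head) frequency schedule is the standard RoPE choice and an assumption here; the PR's actual cache builder is not shown in this excerpt.

// Hypothetical cache builder -- an assumption, not code from this PR.
#include <cmath>
#include <dlib/dnn.h>

void build_rope_caches(long seq_len, long d_head,
                       dlib::resizable_tensor& cos_cache,
                       dlib::resizable_tensor& sin_cache)
{
    const long half_d = d_head / 2;
    cos_cache.set_size(1, 1, seq_len, half_d);  // matches the asserts above
    sin_cache.set_size(1, 1, seq_len, half_d);
    float* c = cos_cache.host();
    float* s = sin_cache.host();
    for (long pos = 0; pos < seq_len; ++pos)
    {
        for (long i = 0; i < half_d; ++i)
        {
            // Standard RoPE frequencies: theta_i = 10000^(-2i/d_head).
            const double angle = pos * std::pow(10000.0, -2.0 * i / d_head);
            c[pos * half_d + i] = static_cast<float>(std::cos(angle));
            s[pos * half_d + i] = static_cast<float>(std::sin(angle));
        }
    }
}

// Usage: rotate tensors of shape (batch, heads, seq_len, d_head) in place;
// pass true on the backward pass to apply the inverse rotation:
//   dlib::cpu::apply_rotary_positional_embedding(false, queries, cos_cache, sin_cache);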
141 changes: 141 additions & 0 deletions dlib/cuda/cpu_dlib.h
@@ -584,6 +584,15 @@ namespace dlib
float scale_factor
);

// -----------------------------------------------------------------------------------

void apply_rotary_positional_embedding(
bool is_backward,
resizable_tensor& data,
const resizable_tensor& cos_cache,
const resizable_tensor& sin_cache
);

// -----------------------------------------------------------------------------------

class pooling
@@ -761,6 +770,138 @@ namespace dlib

// -----------------------------------------------------------------------------------

class compute_loss_cross_entropy_per_logit
{
/*!
Computes the cross-entropy loss for causal language modeling.
Every sequence position contributes to the loss: each position t
predicts the token at position t+1, so positions 0..seq_len-2 take
their targets from the input itself and the final position takes
its target from the truth label.
!*/
public:
compute_loss_cross_entropy_per_logit() {}

template <typename const_label_iterator>
void operator()(
const_label_iterator truth,
const tensor& input_tensor,
const tensor& output_tensor,
tensor& grad,
double& loss,
long ignore_index
) const
{
DLIB_CASSERT(output_tensor.k() == 1);
DLIB_CASSERT(input_tensor.k() == 1);
DLIB_CASSERT(input_tensor.nc() == 1);

const long batch_size = output_tensor.num_samples();
const long seq_len = output_tensor.nr();
const long vocab_size = output_tensor.nc();

const float* out_data = output_tensor.host();
const float* in_data = input_tensor.host();
float* g = grad.host();

std::fill(g, g + grad.size(), 0.0f);

long valid_tokens = 0;

if (ignore_index < 0)
{
valid_tokens = batch_size * seq_len;
}
else {
for (long i = 0; i < batch_size; ++i)
{
for (long t = 0; t < seq_len; ++t)
{
unsigned long target_class;
if (t < seq_len - 1) {
target_class = static_cast<unsigned long>(
in_data[tensor_index(input_tensor, i, 0, t + 1, 0)]
);
}
else
target_class = *(truth + i);

if (static_cast<long>(target_class) != ignore_index)
valid_tokens++;
}
}
}
if (valid_tokens == 0)
{
loss = 0.0;
return;
}

const double scale = 1.0 / valid_tokens;
loss = 0.0;

for (long i = 0; i < batch_size; ++i)
{
// Loop over all positions (0 to seq_len-1)
for (long t = 0; t < seq_len; ++t)
{
unsigned long target_class;

// Extract target token
if (t < seq_len - 1) {
// For positions 0 to seq_len-2: target from input_tensor[t+1]
target_class = static_cast<unsigned long>(
in_data[tensor_index(input_tensor, i, 0, t + 1, 0)]
);
} else {
// For last position (seq_len-1): target from truth
target_class = *(truth + i);
}

if (ignore_index >= 0 && static_cast<long>(target_class) == ignore_index)
continue;

DLIB_CASSERT(target_class < static_cast<unsigned long>(vocab_size));

// Find max logit for numerical stability
float max_val = out_data[tensor_index(output_tensor, i, 0, t, 0)];
for (long c = 1; c < vocab_size; ++c)
{
const float val = out_data[tensor_index(output_tensor, i, 0, t, c)];
max_val = std::max(max_val, val);
}

// Compute softmax denominator
float sum_exp = 0.0f;
for (long c = 0; c < vocab_size; ++c)
{
const unsigned long idx = tensor_index(output_tensor, i, 0, t, c);
const float exp_val = std::exp(out_data[idx] - max_val);
g[idx] = exp_val;
sum_exp += exp_val;
}

// Compute loss and gradients
for (long c = 0; c < vocab_size; ++c)
{
const unsigned long idx = tensor_index(output_tensor, i, 0, t, c);
const float softmax_val = g[idx] / sum_exp;

if (static_cast<unsigned long>(c) == target_class)
{
loss += scale * (-std::log(std::max(softmax_val, 1e-10f)));
g[idx] = scale * (softmax_val - 1.0f);
}
else
{
g[idx] = scale * softmax_val;
}
}
}
}
}
};
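Editor's summary of the functor above, in equations: with logits z_{t,c} at position t, shifted targets y_t (the input token at t+1 for t < T-1, the supplied truth label at t = T-1), and N the number of targets different from ignore_index,

\[
p_{t,c} = \frac{e^{z_{t,c} - m_t}}{\sum_{c'} e^{z_{t,c'} - m_t}},
\qquad m_t = \max_{c'} z_{t,c'},
\]
\[
\text{loss} = -\frac{1}{N} \sum_{t\,:\,y_t \ne \text{ignore}} \log p_{t,y_t},
\qquad
\frac{\partial\,\text{loss}}{\partial z_{t,c}} = \frac{1}{N}\left(p_{t,c} - \mathbf{1}[c = y_t]\right),
\]

which matches the scale = 1/N factor and the two g[idx] assignments in the gradient loop.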

// -----------------------------------------------------------------------------------

class compute_loss_binary_log_per_pixel
{

43 changes: 26 additions & 17 deletions dlib/cuda/cublas_dlibapi.cpp
@@ -159,40 +159,43 @@ namespace dlib
const auto transa = trans_lhs ? CUBLAS_OP_T : CUBLAS_OP_N;
const auto transb = trans_rhs ? CUBLAS_OP_T : CUBLAS_OP_N;

-long num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });
-long num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
-
-auto is_matrix = [](const auto& tensor) {
-    return ((tensor.num_samples() * tensor.k() == 1 && tensor.nr() * tensor.nc() > 1) ||
-            (tensor.num_samples() * tensor.k() > 1 && tensor.nr() * tensor.nc() == 1));
-};
-const bool lhs_is_matrix = is_matrix(lhs), rhs_is_matrix = is_matrix(rhs), dest_is_matrix = is_matrix(dest);
-
-if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix) num_samples = num_channels = 1;
-
-const size_t lhs_plane_size = lhs.nr() * lhs.nc();
-const size_t rhs_plane_size = rhs.nr() * rhs.nc();
-const size_t dest_plane_size = dest.nr() * dest.nc();
+const bool lhs_is_matrix = is_2d_matrix(lhs);
+const bool rhs_is_matrix = is_2d_matrix(rhs);
+const bool dest_is_matrix = is_2d_matrix(dest);
+
+long num_samples, num_channels = std::min({ lhs.k(), rhs.k(), dest.k() });
+if (lhs_is_matrix && rhs_is_matrix && dest_is_matrix)
+    num_samples = 1;
+else if (!lhs_is_matrix && rhs_is_matrix)
+    num_samples = lhs.num_samples();
+else
+    num_samples = std::min({ lhs.num_samples(), rhs.num_samples(), dest.num_samples() });

size_t lhs_rows = lhs.nr();
size_t lhs_cols = lhs.nc();
if (lhs_is_matrix && (lhs.num_samples() > 1 || lhs.k() > 1)) {
lhs_rows = lhs.num_samples();
lhs_cols = lhs.k();
}

size_t rhs_rows = rhs.nr();
size_t rhs_cols = rhs.nc();
if (rhs_is_matrix && (rhs.num_samples() > 1 || rhs.k() > 1)) {
rhs_rows = rhs.num_samples();
rhs_cols = rhs.k();
}

size_t dest_rows = dest.nr();
size_t dest_cols = dest.nc();
if (dest_is_matrix && (dest.num_samples() > 1 || dest.k() > 1)) {
dest_rows = dest.num_samples();
dest_cols = dest.k();
}

+const size_t lhs_plane_size = lhs_rows * lhs_cols;
+const size_t rhs_plane_size = rhs_rows * rhs_cols;
+const size_t dest_plane_size = dest_rows * dest_cols;

for (long b = 0; b < num_samples; ++b)
{
for (long c = 0; c < num_channels; ++c)
@@ -203,12 +206,18 @@ namespace dlib
rhs.device() + (b * num_channels + c) * rhs_plane_size;
auto dest_slice = dest_is_matrix ? dest.device() :
dest.device() + (b * num_channels + c) * dest_plane_size;

const int k = trans_rhs ? rhs_cols : rhs_rows;

CHECK_CUBLAS(cublasSgemm(
-    context(), transb, transa, dest_cols, dest_rows, k,
-    &alpha, rhs_slice, rhs_cols, lhs_slice, lhs_cols,
-    &beta, dest_slice, dest_cols
+    context(),
+    transb, transa,
+    dest_cols, dest_rows, k,
+    &alpha,
+    rhs_slice, rhs_cols,
+    lhs_slice, lhs_cols,
+    &beta,
+    dest_slice, dest_cols
));
}
}
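Editor's note on the call above: cuBLAS is column-major while dlib tensors are row-major, and a row-major matrix reinterpreted as column-major is its transpose. To produce the row-major product dest = lhs * rhs, the call therefore computes

\[
\text{dest}^{\top} = \text{rhs}^{\top} \cdot \text{lhs}^{\top},
\]

which is why rhs_slice is passed before lhs_slice, the transpose flags are given as (transb, transa), the m/n arguments are (dest_cols, dest_rows), and each leading dimension is that matrix's row-major column count.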