Armadillo 15.0.2

eddelbuettel · eddelbuettel · commit 2edb549cf48c · 2025-09-08T06:36:09.000-05:00
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,7 @@
+2025-09-08  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/current/: Armadillo 15.0.2
+
 2025-09-05  Dirk Eddelbuettel  <edd@debian.org>
 
 	* inst/include/armadillo: Add full copyright header
@@ -7,7 +11,6 @@
 	* inst/include/armadillo: Moved back up from legacy/ to restore path
 	* inst/include/armadillo_bits/: Idem
 
-
 2025-09-01  Dirk Eddelbuettel  <edd@debian.org>
 
 	* DESCRIPTION (Version, Date): RcppArmadillo 15.0.1-1
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -3,6 +3,16 @@
 \newcommand{\ghpr}{\href{https://github.com/RcppCore/RcppArmadillo/pull/#1}{##1}}
 \newcommand{\ghit}{\href{https://github.com/RcppCore/RcppArmadillo/issues/#1}{##1}}
 
+\section{Changes in RcppArmadillo version 15.0.2-1 (2025-09-08)}{
+  \itemize{
+    \item Upgraded to Armadillo release 15.0.2 (Medium Roast)
+    \itemize{
+      \item Optionally use OpenMP parallelisation for fp16 matrix multiplication
+      \item Faster vectorisation of cube tubes
+    }
+  }
+}
+
 \section{Changes in RcppArmadillo version 15.0.1-1 (2025-09-01)}{
   \itemize{
     \item Upgraded to Armadillo release 15.0.1 (Medium Roast)
diff --git a/inst/include/current/armadillo_bits/arma_version.hpp b/inst/include/current/armadillo_bits/arma_version.hpp
@@ -23,7 +23,7 @@
 
 #define ARMA_VERSION_MAJOR 15
 #define ARMA_VERSION_MINOR 0
-#define ARMA_VERSION_PATCH 1
+#define ARMA_VERSION_PATCH 2
 #define ARMA_VERSION_NAME  "Medium Roast"
 
 
diff --git a/inst/include/current/armadillo_bits/mul_gemm.hpp b/inst/include/current/armadillo_bits/mul_gemm.hpp
@@ -59,9 +59,36 @@ struct gemm_emul_tinysq
 
 
 
+struct gemm_emul_large_mp_helper  // row-gathering helper; called by gemm_emul_large_mp to copy one matrix row into scratch memory
+  {
+  template<typename eT>
+  arma_hot
+  inline
+  static
+  void
+  copy_row(eT* out_mem, const Mat<eT>& in, const uword row)  // writes out_mem[0 .. in.n_cols-1]; no bounds checks on 'row' or 'out_mem' are done here
+    {
+    const uword n_rows = in.n_rows;
+    const uword n_cols = in.n_cols;
+    
+    const eT* in_mem_row = in.memptr() + row;  // element at offset 'row' from the start of the matrix memory
+    
+    for(uword i=0; i < n_cols; ++i)
+      {
+      out_mem[i] = (*in_mem_row);
+      
+      in_mem_row += n_rows;  // successive elements of the row are n_rows apart (presumably column-major storage in Mat)
+      }
+    }
+  };
+
+
+
+#if defined(ARMA_USE_OPENMP)
 //! emulation of gemm(), for non-complex matrices only, as it assumes only simple transposes (ie. doesn't do hermitian transposes)
+//! parallelised version
 template<const bool do_trans_A=false, const bool do_trans_B=false, const bool use_alpha=false, const bool use_beta=false>
-struct gemm_emul_large
+struct gemm_emul_large_mp
   {
   template<typename eT, typename TA, typename TB>
   arma_hot
@@ -78,13 +105,151 @@ struct gemm_emul_large
     )
     {
     arma_debug_sigprint();
+    
+    const uword A_n_rows = A.n_rows;
+    const uword A_n_cols = A.n_cols;
+    
+    const uword B_n_rows = B.n_rows;
+    const uword B_n_cols = B.n_cols;
+    
+    if( (do_trans_A == false) && (do_trans_B == false) )
+      {
+      const uword n_threads = uword(mp_thread_limit::get());
+      
+      podarray<eT> tmp(A_n_cols * n_threads, arma_nozeros_indicator());
+      
+      eT* tmp_mem = tmp.memptr();
+      
+      #pragma omp parallel for schedule(static) num_threads(int(n_threads))
+      for(uword row_A=0; row_A < A_n_rows; ++row_A)
+        {
+        const uword thread_id = uword(omp_get_thread_num());
+        
+        eT* A_rowdata = tmp_mem + (A_n_cols * thread_id);
+        
+        gemm_emul_large_mp_helper::copy_row(A_rowdata, A, row_A);
+        
+        for(uword col_B=0; col_B < B_n_cols; ++col_B)
+          {
+          const eT acc = op_dot::direct_dot(B_n_rows, A_rowdata, B.colptr(col_B));
+          
+               if( (use_alpha == false) && (use_beta == false) )  { C.at(row_A,col_B) =       acc;                          }
+          else if( (use_alpha == true ) && (use_beta == false) )  { C.at(row_A,col_B) = alpha*acc;                          }
+          else if( (use_alpha == false) && (use_beta == true ) )  { C.at(row_A,col_B) =       acc + beta*C.at(row_A,col_B); }
+          else if( (use_alpha == true ) && (use_beta == true ) )  { C.at(row_A,col_B) = alpha*acc + beta*C.at(row_A,col_B); }
+          }
+        }
+      }
+    else
+    if( (do_trans_A == true) && (do_trans_B == false) )
+      {
+      const int n_threads = mp_thread_limit::get();
+      
+      #pragma omp parallel for schedule(static) num_threads(n_threads)
+      for(uword col_A=0; col_A < A_n_cols; ++col_A)
+        {
+        // col_A is interpreted as row_A when storing the results in matrix C
+        
+        const eT* A_coldata = A.colptr(col_A);
+        
+        for(uword col_B=0; col_B < B_n_cols; ++col_B)
+          {
+          const eT acc = op_dot::direct_dot(B_n_rows, A_coldata, B.colptr(col_B));
+          
+               if( (use_alpha == false) && (use_beta == false) )  { C.at(col_A,col_B) =       acc;                          }
+          else if( (use_alpha == true ) && (use_beta == false) )  { C.at(col_A,col_B) = alpha*acc;                          }
+          else if( (use_alpha == false) && (use_beta == true ) )  { C.at(col_A,col_B) =       acc + beta*C.at(col_A,col_B); }
+          else if( (use_alpha == true ) && (use_beta == true ) )  { C.at(col_A,col_B) = alpha*acc + beta*C.at(col_A,col_B); }
+          }
+        }
+      }
+    else
+    if( (do_trans_A == false) && (do_trans_B == true) )
+      {
+      Mat<eT> BB;
+      op_strans::apply_mat_noalias(BB, B);
+      
+      gemm_emul_large_mp<false, false, use_alpha, use_beta>::apply(C, A, BB, alpha, beta);
+      }
+    else
+    if( (do_trans_A == true) && (do_trans_B == true) )
+      {
+      // using trans(A)*trans(B) = trans(B*A) equivalency; assuming no hermitian transpose
+      
+      const uword n_threads = uword(mp_thread_limit::get());
+      
+      podarray<eT> tmp(B_n_cols * n_threads, arma_nozeros_indicator());
+      
+      eT* tmp_mem = tmp.memptr();
+      
+      #pragma omp parallel for schedule(static) num_threads(int(n_threads))
+      for(uword row_B=0; row_B < B_n_rows; ++row_B)
+        {
+        const uword thread_id = uword(omp_get_thread_num());
+        
+        eT* B_rowdata = tmp_mem + (B_n_cols * thread_id);
+        
+        gemm_emul_large_mp_helper::copy_row(B_rowdata, B, row_B);
+        
+        for(uword col_A=0; col_A < A_n_cols; ++col_A)
+          {
+          const eT acc = op_dot::direct_dot(A_n_rows, B_rowdata, A.colptr(col_A));
+          
+               if( (use_alpha == false) && (use_beta == false) )  { C.at(col_A,row_B) =       acc;                          }
+          else if( (use_alpha == true ) && (use_beta == false) )  { C.at(col_A,row_B) = alpha*acc;                          }
+          else if( (use_alpha == false) && (use_beta == true ) )  { C.at(col_A,row_B) =       acc + beta*C.at(col_A,row_B); }
+          else if( (use_alpha == true ) && (use_beta == true ) )  { C.at(col_A,row_B) = alpha*acc + beta*C.at(col_A,row_B); }
+          }
+        }
+      }
+    }
+  
+  };
+#endif
+
+
 
+//! emulation of gemm(), for non-complex matrices only, as it assumes only simple transposes (ie. doesn't do hermitian transposes)
+template<const bool do_trans_A=false, const bool do_trans_B=false, const bool use_alpha=false, const bool use_beta=false>
+struct gemm_emul_large
+  {
+  template<typename eT, typename TA, typename TB>
+  arma_hot
+  inline
+  static
+  void
+  apply
+    (
+          Mat<eT>& C,
+    const TA&      A,
+    const TB&      B,
+    const eT       alpha = eT(1),
+    const eT       beta  = eT(0)
+    )
+    {
+    arma_debug_sigprint();
+    
     const uword A_n_rows = A.n_rows;
     const uword A_n_cols = A.n_cols;
     
     const uword B_n_rows = B.n_rows;
     const uword B_n_cols = B.n_cols;
     
+    #if defined(ARMA_USE_OPENMP)
+      {
+      // TODO: replace with more sophisticated threshold mechanism
+      
+      constexpr uword threshold = uword(30);
+      
+      if( (A_n_rows >= threshold) && (A_n_cols >= threshold) && (B_n_rows >= threshold) && (B_n_cols >= threshold) && (mp_thread_limit::in_parallel() == false) )
+        {
+        gemm_emul_large_mp<do_trans_A, do_trans_B, use_alpha, use_beta>::apply(C,A,B,alpha,beta);
+        
+        return;
+        }
+      }
+    #endif
+    
     if( (do_trans_A == false) && (do_trans_B == false) )
       {
       arma_aligned podarray<eT> tmp(A_n_cols);
diff --git a/inst/include/current/armadillo_bits/mul_gemv.hpp b/inst/include/current/armadillo_bits/mul_gemv.hpp
@@ -203,6 +203,74 @@ struct gemv_emul_helper
 
 
 
+#if defined(ARMA_USE_OPENMP)
+//! Partial emulation of BLAS gemv(): y = [alpha*] op(A)*x [+ beta*y], with the alpha and beta terms used only when use_alpha / use_beta are true.
+//! 'y' is assumed to have been set to the correct size (ie. taking into account the transpose)
+//! parallelised version: the loop over the elements of 'y' is an OpenMP parallel for with a static schedule
+template<const bool do_trans_A=false, const bool use_alpha=false, const bool use_beta=false>
+struct gemv_emul_mp
+  {
+  template<typename eT, typename TA>
+  arma_hot
+  inline
+  static
+  void
+  apply( eT* y, const TA& A, const eT* x, const eT alpha = eT(1), const eT beta = eT(0) )
+    {
+    arma_debug_sigprint();
+    
+    const int n_threads = mp_thread_limit::get();  // passed to the num_threads() clauses below
+    
+    const uword A_n_rows = A.n_rows;
+    const uword A_n_cols = A.n_cols;
+    
+    if(do_trans_A == false)
+      {
+      #pragma omp parallel for schedule(static) num_threads(n_threads)
+      for(uword row=0; row < A_n_rows; ++row)  // each iteration reads and writes only y[row]
+        {
+        const eT acc = gemv_emul_helper::dot_row_col(A, x, row, A_n_cols);  // presumably the dot product of row 'row' of A with x; helper is defined outside this block
+        
+             if( (use_alpha == false) && (use_beta == false) )  { y[row] =       acc;               }
+        else if( (use_alpha == true ) && (use_beta == false) )  { y[row] = alpha*acc;               }
+        else if( (use_alpha == false) && (use_beta == true ) )  { y[row] =       acc + beta*y[row]; }
+        else if( (use_alpha == true ) && (use_beta == true ) )  { y[row] = alpha*acc + beta*y[row]; }
+        }
+      }
+    else
+    if(do_trans_A == true)
+      {
+      if(is_cx<eT>::no)  // presumably true for non-complex eT; columns of A are then used directly, without forming a transpose
+        {
+        #pragma omp parallel for schedule(static) num_threads(n_threads)
+        for(uword col=0; col < A_n_cols; ++col)
+          {
+          // col is interpreted as row when storing the results in 'y'
+          
+          const eT acc = op_dot::direct_dot(A_n_rows, A.colptr(col), x);  // op_dot::direct_dot over the A_n_rows elements of column 'col' of A and of x
+          
+               if( (use_alpha == false) && (use_beta == false) )  { y[col] =       acc;               }
+          else if( (use_alpha == true ) && (use_beta == false) )  { y[col] = alpha*acc;               }
+          else if( (use_alpha == false) && (use_beta == true ) )  { y[col] =       acc + beta*y[col]; }
+          else if( (use_alpha == true ) && (use_beta == true ) )  { y[col] = alpha*acc + beta*y[col]; }
+          }
+        }
+      else
+        {
+        Mat<eT> AA;  // temporary that receives the output of op_htrans applied to A
+        
+        op_htrans::apply_mat_noalias(AA, A);
+        
+        gemv_emul_mp<false, use_alpha, use_beta>::apply(y, AA, x, alpha, beta);  // reuse the non-transposed path on the temporary
+        }
+      }
+    }
+  
+  };
+#endif
+
+
+
 //! \brief
 //! Partial emulation of BLAS gemv().
 //! 'y' is assumed to have been set to the correct size (ie. taking into account the transpose)
@@ -222,6 +290,21 @@ struct gemv_emul
     const uword A_n_rows = A.n_rows;
     const uword A_n_cols = A.n_cols;
     
+    #if defined(ARMA_USE_OPENMP)
+      {
+      // TODO: replace with more sophisticated threshold mechanism
+      
+      constexpr uword threshold = uword(200);
+      
+      if( (A_n_rows >= threshold) && (A_n_cols >= threshold) && (mp_thread_limit::in_parallel() == false) )
+        {
+        gemv_emul_mp<do_trans_A, use_alpha, use_beta>::apply(y, A, x, alpha, beta);
+        
+        return;
+        }
+      }
+    #endif
+    
     if(do_trans_A == false)
       {
       if(A_n_rows == 1)
diff --git a/inst/include/current/armadillo_bits/op_vectorise_meat.hpp b/inst/include/current/armadillo_bits/op_vectorise_meat.hpp
@@ -365,14 +365,30 @@ op_vectorise_cube_col::apply_subview(Mat<eT>& out, const subview_cube<eT>& sv)
   
   out.set_size(sv.n_elem, 1);
   
+  if(sv.n_elem == 0)  { return; }
+  
   eT* out_mem = out.memptr();
   
-  for(uword s=0; s < sv_ns; ++s)
-  for(uword c=0; c < sv_nc; ++c)
+  if( (sv_nr == 1) && (sv_nc == 1) && (sv.aux_slice1 == 0) )
     {
-    arrayops::copy(out_mem, sv.slice_colptr(s,c), sv_nr);
+    const uword sv_m_n_elem_slice = sv.m.n_elem_slice;
+    
+    const eT* sv_m_ptr = &( sv.m.at(sv.aux_row1, sv.aux_col1, 0) );
     
-    out_mem += sv_nr;
+    for(uword s=0; s < sv_ns; ++s)
+      {
+      out_mem[s] = (*sv_m_ptr);  sv_m_ptr += sv_m_n_elem_slice;
+      }
+    }
+  else
+    {
+    for(uword s=0; s < sv_ns; ++s)
+    for(uword c=0; c < sv_nc; ++c)
+      {
+      arrayops::copy(out_mem, sv.slice_colptr(s,c), sv_nr);
+      
+      out_mem += sv_nr;
+      }
     }
   }