From d34c3e718a7903826065750d5306bcc8d36c475b Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:21:43 +0000
Subject: [PATCH 1/7] WIP: initial translation of vectorized RGB->YCbCr from
 std::simd to NEON intrinsics

---
 src/lib.rs        |  2 ++
 src/neon.rs       |  3 +++
 src/neon/ycbcr.rs | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 src/neon.rs
 create mode 100644 src/neon/ycbcr.rs

diff --git a/src/lib.rs b/src/lib.rs
index 9fc0a0e..20679be 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,6 +33,8 @@ extern crate core;
 
 #[cfg(all(feature = "simd", any(target_arch = "x86", target_arch = "x86_64")))]
 mod avx2;
+#[cfg(all(feature = "simd", target_feature = "neon"))]
+mod neon;
 mod encoder;
 mod error;
 mod fdct;
diff --git a/src/neon.rs b/src/neon.rs
new file mode 100644
index 0000000..43b5e68
--- /dev/null
+++ b/src/neon.rs
@@ -0,0 +1,3 @@
+mod ycbcr;
+
+pub(crate) use ycbcr::*;
diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
new file mode 100644
index 0000000..ab3fc51
--- /dev/null
+++ b/src/neon/ycbcr.rs
@@ -0,0 +1,57 @@
+#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::*;
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[unsafe(no_mangle)]
+#[inline(never)]
+unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x4_t, int32x4_t, int32x4_t) {
+    // To avoid floating point math this scales everything by 2^16 which gives
+    // a precision of approx 4 digits.
+    //
+    // Non scaled conversion:
+    // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+    // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+    // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+
+    let y1_mul = vdupq_n_s32(19595);
+    let y2_mul = vdupq_n_s32(38470);
+    let y3_mul = vdupq_n_s32(7471);
+
+    let cb1_mul = vdupq_n_s32(-11059);
+    let cb2_mul = vdupq_n_s32(21709);
+    let cb3_mul = vdupq_n_s32(32768);
+    let cb4_mul = vdupq_n_s32(128 << 16);
+
+    let cr1_mul = vdupq_n_s32(32768);
+    let cr2_mul = vdupq_n_s32(27439);
+    let cr3_mul = vdupq_n_s32(5329);
+    let cr4_mul = vdupq_n_s32(128 << 16);
+
+    // Y = y1_mul * r + y2_mul * g + y3_mul * b
+    let y = vmlaq_s32(vmlaq_s32(vmulq_s32(y1_mul, r), y2_mul, g), y3_mul, b);
+
+    // Cb = cb1_mul * r - cb2_mul * g + cb3_mul * b + cb4_mul
+    let cb = vaddq_s32(
+        vmlaq_s32(vmlsq_s32(vmulq_s32(cb1_mul, r), cb2_mul, g), cb3_mul, b),
+        cb4_mul
+    );
+
+    // Cr = cr1_mul * r - cr2_mul * g - cr3_mul * b + cr4_mul
+    let cr = vaddq_s32(
+        vmlsq_s32(vmlsq_s32(vmulq_s32(cr1_mul, r), cr2_mul, g), cr3_mul, b),
+        cr4_mul
+    );
+
+    #[inline(always)]
+    unsafe fn round_shift(v: int32x4_t) -> int32x4_t {
+        let round = vdupq_n_s32(0x7FFF);
+        vshrq_n_s32(vaddq_s32(v, round), 16)
+    }
+
+    (
+        round_shift(y),
+        round_shift(cb),
+        round_shift(cr)
+    )
+}
\ No newline at end of file
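For reference, each fixed-point multiplier above is just the corresponding weight from the comment scaled by 2^16 and rounded; a quick illustrative check (not part of the series) that the magic numbers match the stated weights:

    #[test]
    fn fixed_point_constants_match_weights() {
        let scaled = |w: f64| (w * 65536.0).round() as i32;
        assert_eq!(scaled(0.29900), 19595);
        assert_eq!(scaled(0.58700), 38470);
        assert_eq!(scaled(0.11400), 7471);
        assert_eq!(scaled(0.16874), 11059); // stored negated for Cb
        assert_eq!(scaled(0.33126), 21709); // subtracted via vmlsq_s32
        assert_eq!(scaled(0.50000), 32768);
        assert_eq!(scaled(0.41869), 27439); // subtracted via vmlsq_s32
        assert_eq!(scaled(0.08131), 5329);  // subtracted via vmlsq_s32
    }

The `+ 0x7FFF` added before the 16-bit right shift in round_shift makes the scaled result round to the nearest integer instead of always rounding down.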
From ea673199d987bc4e1a4e96c9c1951384b157408d Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:24:46 +0000
Subject: [PATCH 2/7] Widen the NEON function to take advantage of
 instruction-level parallelism and wide issue on recent Aarch64

---
 src/neon/ycbcr.rs | 97 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 19 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index ab3fc51..c4eba39 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -5,7 +5,11 @@ use std::arch::aarch64::*;
 #[target_feature(enable = "neon")]
 #[unsafe(no_mangle)]
 #[inline(never)]
-unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x4_t, int32x4_t, int32x4_t) {
+unsafe fn rgb_to_ycbcr_simd(
+    r: [i32; 8],
+    g: [i32; 8],
+    b: [i32; 8],
+) -> ([i32; 8], [i32; 8], [i32; 8]) {
     // To avoid floating point math this scales everything by 2^16 which gives
     // a precision of approx 4 digits.
     //
     // Non scaled conversion:
     // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
@@ -14,6 +18,14 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
     // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
+    // Load input arrays into NEON registers (2 registers per channel)
+    let r_lo = vld1q_s32(r.as_ptr());
+    let r_hi = vld1q_s32(r.as_ptr().add(4));
+    let g_lo = vld1q_s32(g.as_ptr());
+    let g_hi = vld1q_s32(g.as_ptr().add(4));
+    let b_lo = vld1q_s32(b.as_ptr());
+    let b_hi = vld1q_s32(b.as_ptr().add(4));
+
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
     let y3_mul = vdupq_n_s32(7471);
@@ -28,19 +40,50 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
     let cr3_mul = vdupq_n_s32(5329);
     let cr4_mul = vdupq_n_s32(128 << 16);
 
-    // Y = y1_mul * r + y2_mul * g + y3_mul * b
-    let y = vmlaq_s32(vmlaq_s32(vmulq_s32(y1_mul, r), y2_mul, g), y3_mul, b);
-
-    // Cb = cb1_mul * r - cb2_mul * g + cb3_mul * b + cb4_mul
-    let cb = vaddq_s32(
-        vmlaq_s32(vmlsq_s32(vmulq_s32(cb1_mul, r), cb2_mul, g), cb3_mul, b),
-        cb4_mul
+    // Process low 4 elements
+    let y_lo = vmlaq_s32(
+        vmlaq_s32(vmulq_s32(y1_mul, r_lo), y2_mul, g_lo),
+        y3_mul,
+        b_lo,
+    );
+    let cb_lo = vaddq_s32(
+        vmlaq_s32(
+            vmlsq_s32(vmulq_s32(cb1_mul, r_lo), cb2_mul, g_lo),
+            cb3_mul,
+            b_lo,
+        ),
+        cb4_mul,
     );
-
-    // Cr = cr1_mul * r - cr2_mul * g - cr3_mul * b + cr4_mul
-    let cr = vaddq_s32(
-        vmlsq_s32(vmlsq_s32(vmulq_s32(cr1_mul, r), cr2_mul, g), cr3_mul, b),
-        cr4_mul
+    let cr_lo = vaddq_s32(
+        vmlsq_s32(
+            vmlsq_s32(vmulq_s32(cr1_mul, r_lo), cr2_mul, g_lo),
+            cr3_mul,
+            b_lo,
+        ),
+        cr4_mul,
+    );
+
+    // Process high 4 elements
+    let y_hi = vmlaq_s32(
+        vmlaq_s32(vmulq_s32(y1_mul, r_hi), y2_mul, g_hi),
+        y3_mul,
+        b_hi,
+    );
+    let cb_hi = vaddq_s32(
+        vmlaq_s32(
+            vmlsq_s32(vmulq_s32(cb1_mul, r_hi), cb2_mul, g_hi),
+            cb3_mul,
+            b_hi,
+        ),
+        cb4_mul,
+    );
+    let cr_hi = vaddq_s32(
+        vmlsq_s32(
+            vmlsq_s32(vmulq_s32(cr1_mul, r_hi), cr2_mul, g_hi),
+            cr3_mul,
+            b_hi,
+        ),
+        cr4_mul,
     );
 
     #[inline(always)]
@@ -49,9 +92,25 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
         vshrq_n_s32(vaddq_s32(v, round), 16)
     }
 
-    (
-        round_shift(y),
-        round_shift(cb),
-        round_shift(cr)
-    )
-}
\ No newline at end of file
+    // Round and shift
+    let y_lo = round_shift(y_lo);
+    let y_hi = round_shift(y_hi);
+    let cb_lo = round_shift(cb_lo);
+    let cb_hi = round_shift(cb_hi);
+    let cr_lo = round_shift(cr_lo);
+    let cr_hi = round_shift(cr_hi);
+
+    // Store results back to arrays
+    let mut y_out = [0i32; 8];
+    let mut cb_out = [0i32; 8];
+    let mut cr_out = [0i32; 8];
+
+    vst1q_s32(y_out.as_mut_ptr(), y_lo);
+    vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
+    vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
+    vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
+    vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
+    vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+
+    (y_out, cb_out, cr_out)
+}
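The widened version keeps two independent int32x4_t dependency chains per channel, which is what lets a wide out-of-order AArch64 core overlap the multiply-accumulate latencies. An illustrative lane-by-lane check against the scalar formula (the test is not part of the series):

    #[cfg(all(test, target_arch = "aarch64"))]
    #[test]
    fn widened_kernel_matches_scalar_formula() {
        let r = [0, 255, 12, 99, 180, 33, 250, 1];
        let g = [255, 0, 34, 56, 78, 90, 120, 200];
        let b = [7, 13, 255, 0, 66, 128, 3, 77];
        // The kernel is still an `unsafe fn` at this point in the series.
        let (y, cb, cr) = unsafe { rgb_to_ycbcr_simd(r, g, b) };
        let round = |v: i32| (v + 0x7FFF) >> 16;
        for i in 0..8 {
            assert_eq!(y[i], round(19595 * r[i] + 38470 * g[i] + 7471 * b[i]));
            assert_eq!(cb[i], round(-11059 * r[i] - 21709 * g[i] + 32768 * b[i] + (128 << 16)));
            assert_eq!(cr[i], round(32768 * r[i] - 27439 * g[i] - 5329 * b[i] + (128 << 16)));
        }
    }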
From febf93aa5c6075ea39395e9667d61b97b7a83dd0 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:44:04 +0000
Subject: [PATCH 3/7] Add safe wrappers for loads/stores, general safety pass

---
 src/neon/ycbcr.rs | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index c4eba39..b3554ec 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -1,11 +1,11 @@
+#[cfg(target_arch = "arm")] // 32-bit ARM with NEON
+use std::arch::arm::*;
+
 #[cfg(target_arch = "aarch64")]
 use std::arch::aarch64::*;
 
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[unsafe(no_mangle)]
-#[inline(never)]
-unsafe fn rgb_to_ycbcr_simd(
+fn rgb_to_ycbcr_simd(
     r: [i32; 8],
     g: [i32; 8],
     b: [i32; 8],
@@ -19,12 +19,12 @@ unsafe fn rgb_to_ycbcr_simd(
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
     // Load input arrays into NEON registers (2 registers per channel)
-    let r_lo = vld1q_s32(r.as_ptr());
-    let r_hi = vld1q_s32(r.as_ptr().add(4));
-    let g_lo = vld1q_s32(g.as_ptr());
-    let g_hi = vld1q_s32(g.as_ptr().add(4));
-    let b_lo = vld1q_s32(b.as_ptr());
-    let b_hi = vld1q_s32(b.as_ptr().add(4));
+    let r_lo = load_i32x4(r[..4]);
+    let r_hi = load_i32x4(r[4..]);
+    let g_lo = load_i32x4(g[..4]);
+    let g_hi = load_i32x4(g[4..]);
+    let b_lo = load_i32x4(b[..4]);
+    let b_hi = load_i32x4(b[4..]);
 
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
@@ -86,8 +86,9 @@ unsafe fn rgb_to_ycbcr_simd(
         cr4_mul,
     );
 
-    #[inline(always)]
-    unsafe fn round_shift(v: int32x4_t) -> int32x4_t {
+    #[target_feature(enable = "neon")]
+    #[inline]
+    fn round_shift(v: int32x4_t) -> int32x4_t {
         let round = vdupq_n_s32(0x7FFF);
         vshrq_n_s32(vaddq_s32(v, round), 16)
     }
@@ -114,3 +115,23 @@ unsafe fn rgb_to_ycbcr_simd(
 
     (y_out, cb_out, cr_out)
 }
+
+#[target_feature(enable = "neon")]
+fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
+    // Safety preconditions. Optimized away in release mode, no runtime cost.
+    assert!(core::mem::size_of::<int32x4_t>() == core::mem::size_of::<[i32; 4]>());
+    // SAFETY: size checked above.
+    // NEON load intrinsics do not care if data is aligned.
+    // Both types are plain old data: no pointers, lifetimes, etc.
+    vld1q_s32(arr.as_ptr())
+}
+
+#[target_feature(enable = "neon")]
+fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
+    // Safety preconditions. Optimized away in release mode, no runtime cost.
+    assert!(arr.len() * core::mem::size_of::<i32>() >= core::mem::size_of::<int32x4_t>());
+    // SAFETY: size checked above.
+    // NEON load intrinsics do not care if data is aligned.
+    // Both types are plain old data: no pointers, lifetimes, etc.
+    vst1q_s32(arr.as_mut_ptr(), vec);
+}
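The store_i32x4 wrapper added here is not called yet; the raw vst1q_s32 stores remain until later in the series. As a sketch (assuming the wrapper keeps its &mut [i32] signature), the tail of rgb_to_ycbcr_simd could go through it like this:

    let mut y_out = [0i32; 8];
    store_i32x4(&mut y_out[..4], y_lo);
    store_i32x4(&mut y_out[4..], y_hi);
    // ...and the same for cb_out and cr_out.

Because the caller and the wrapper are both #[target_feature(enable = "neon")], these calls need no unsafe block.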
From b45225aeca4db5a76c023c2ec009d50bb504cc1a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:03:04 +0000
Subject: [PATCH 4/7] Add a function to load u8 values to an i32 array with
 widening

---
 src/neon/ycbcr.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index b3554ec..df1086f 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -135,3 +135,21 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // Both types are plain old data: no pointers, lifetimes, etc.
     vst1q_s32(arr.as_mut_ptr(), vec);
 }
+
+#[inline]
+#[target_feature(enable = "neon")]
+fn load_u8_to_i32<const stride: usize>(values: &[u8]) -> [i32; 8] {
+    // avoid bounds checks further down
+    let values = &values[..7*stride + 1];
+
+    [
+        values[0 * stride] as i32,
+        values[1 * stride] as i32,
+        values[2 * stride] as i32,
+        values[3 * stride] as i32,
+        values[4 * stride] as i32,
+        values[5 * stride] as i32,
+        values[6 * stride] as i32,
+        values[7 * stride] as i32,
+    ]
+}
\ No newline at end of file

From 59d76b1a9c9ec2221896aced8d3a9209a2660693 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:04:07 +0000
Subject: [PATCH 5/7] Address compiler warning

---
 src/neon/ycbcr.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index df1086f..d09abbd 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -138,18 +138,18 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
 
 #[inline]
 #[target_feature(enable = "neon")]
-fn load_u8_to_i32<const stride: usize>(values: &[u8]) -> [i32; 8] {
+fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
     // avoid bounds checks further down
-    let values = &values[..7*stride + 1];
+    let values = &values[..7*STRIDE + 1];
 
     [
-        values[0 * stride] as i32,
-        values[1 * stride] as i32,
-        values[2 * stride] as i32,
-        values[3 * stride] as i32,
-        values[4 * stride] as i32,
-        values[5 * stride] as i32,
-        values[6 * stride] as i32,
-        values[7 * stride] as i32,
+        values[0 * STRIDE] as i32,
+        values[1 * STRIDE] as i32,
+        values[2 * STRIDE] as i32,
+        values[3 * STRIDE] as i32,
+        values[4 * STRIDE] as i32,
+        values[5 * STRIDE] as i32,
+        values[6 * STRIDE] as i32,
+        values[7 * STRIDE] as i32,
     ]
 }
\ No newline at end of file
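load_u8_to_i32 widens 8 bytes taken STRIDE apart, so one call per channel de-interleaves a row of packed pixels. Intended use on interleaved RGB data, as a sketch (the row contents are placeholders):

    let row = [0u8; 3 * 8]; // 8 interleaved RGB pixels
    let r = load_u8_to_i32::<3>(&row[0..]);
    let g = load_u8_to_i32::<3>(&row[1..]);
    let b = load_u8_to_i32::<3>(&row[2..]);

Even for the last channel the slice still holds the 7 * STRIDE + 1 = 22 bytes that the bounds-check prologue expects.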
From 010ef9bbfb10bd7ad96ffa77d63122fc29434d39 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:09:47 +0000
Subject: [PATCH 6/7] Fix build failures

---
 src/neon/ycbcr.rs | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index d09abbd..1b58657 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -19,12 +19,12 @@ fn rgb_to_ycbcr_simd(
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
     // Load input arrays into NEON registers (2 registers per channel)
-    let r_lo = load_i32x4(r[..4]);
-    let r_hi = load_i32x4(r[4..]);
-    let g_lo = load_i32x4(g[..4]);
-    let g_hi = load_i32x4(g[4..]);
-    let b_lo = load_i32x4(b[..4]);
-    let b_hi = load_i32x4(b[4..]);
+    let r_lo = load_i32x4(r[..4].try_into().unwrap());
+    let r_hi = load_i32x4(r[4..].try_into().unwrap());
+    let g_lo = load_i32x4(g[..4].try_into().unwrap());
+    let g_hi = load_i32x4(g[4..].try_into().unwrap());
+    let b_lo = load_i32x4(b[..4].try_into().unwrap());
+    let b_hi = load_i32x4(b[4..].try_into().unwrap());
 
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
@@ -106,12 +106,15 @@ fn rgb_to_ycbcr_simd(
     let mut cb_out = [0i32; 8];
     let mut cr_out = [0i32; 8];
 
-    vst1q_s32(y_out.as_mut_ptr(), y_lo);
-    vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
-    vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
-    vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
-    vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
-    vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+    // TODO: refactor into safe stores
+    unsafe {
+        vst1q_s32(y_out.as_mut_ptr(), y_lo);
+        vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
+        vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
+        vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
+        vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
+        vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+    }
 
     (y_out, cb_out, cr_out)
 }
@@ -123,7 +126,7 @@ fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    vld1q_s32(arr.as_ptr())
+    unsafe { vld1q_s32(arr.as_ptr()) }
 }
 
 #[target_feature(enable = "neon")]
@@ -133,7 +136,7 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    vst1q_s32(arr.as_mut_ptr(), vec);
+    unsafe { vst1q_s32(arr.as_mut_ptr(), vec); }
 }
 
 #[inline]
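The try_into().unwrap() calls introduced above only turn a 4-element slice of an [i32; 8] into the &[i32; 4] reference that load_i32x4 expects; spelled out (illustrative), the conversion cannot fail because the slice length is fixed:

    let r = [0i32; 8];
    let lo: &[i32; 4] = r[..4].try_into().unwrap(); // length is exactly 4
    let hi: &[i32; 4] = r[4..].try_into().unwrap(); // length is exactly 4

The new unsafe blocks are only needed around vld1q_s32 and vst1q_s32, which dereference raw pointers; the arithmetic intrinsics remain callable without unsafe inside #[target_feature(enable = "neon")] functions.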
From 1b3631d0b5efd5b3c3c6efdda78a973536e2ee84 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:35:40 +0000
Subject: [PATCH 7/7] Wire up all the loose functions to a struct

---
 src/neon/ycbcr.rs | 100 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 9 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index 1b58657..bd849d2 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -4,12 +4,90 @@ use std::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use std::arch::aarch64::*;
 
+use alloc::vec::Vec;
+
+use crate::{rgb_to_ycbcr, ImageBuffer, JpegColorType};
+
+macro_rules! ycbcr_image_neon {
+    ($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
+        pub(crate) struct $name<'a>(pub &'a [u8], pub u16, pub u16);
+
+        impl<'a> $name<'a> {
+            #[target_feature(enable = "neon")]
+            fn fill_buffers_neon(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
+                #[inline]
+                #[target_feature(enable = "neon")]
+                // The stride is the number of interleaved color channels per pixel.
+                fn load3(data: &[u8]) -> [i32; 8] {
+                    load_channel::<$num_colors>(data)
+                }
+
+                let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
+                y_buffer.reserve(self.width() as usize);
+                cb_buffer.reserve(self.width() as usize);
+                cr_buffer.reserve(self.width() as usize);
+
+                let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..];
+
+                for _ in 0..self.width() / 8 {
+                    let r = load3(&data[$o1..]);
+                    let g = load3(&data[$o2..]);
+                    let b = load3(&data[$o3..]);
+
+                    data = &data[($num_colors * 8)..];
+
+                    let (y, cb, cr) = rgb_to_ycbcr_simd(r, g, b);
+
+                    let y: [u8; 8] = y.map(|x| x as u8);
+                    y_buffer.extend_from_slice(&y);
+
+                    let cb: [u8; 8] = cb.map(|x| x as u8);
+                    cb_buffer.extend_from_slice(&cb);
+
+                    let cr: [u8; 8] = cr.map(|x| x as u8);
+                    cr_buffer.extend_from_slice(&cr);
+                }
+
+                for _ in 0..self.width() % 8 {
+                    let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]);
+                    data = &data[$num_colors..];
+
+                    y_buffer.push(y);
+                    cb_buffer.push(cb);
+                    cr_buffer.push(cr);
+                }
+            }
+        }
+
+        impl<'a> ImageBuffer for $name<'a> {
+            fn get_jpeg_color_type(&self) -> JpegColorType {
+                JpegColorType::Ycbcr
+            }
+
+            fn width(&self) -> u16 {
+                self.1
+            }
+
+            fn height(&self) -> u16 {
+                self.2
+            }
+
+            #[inline(always)]
+            fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
+                unsafe {
+                    self.fill_buffers_neon(y, buffers);
+                }
+            }
+        }
+    };
+}
+
+ycbcr_image_neon!(RgbImageNeon, 3, 0, 1, 2);
+ycbcr_image_neon!(RgbaImageNeon, 4, 0, 1, 2);
+ycbcr_image_neon!(BgrImageNeon, 3, 2, 1, 0);
+ycbcr_image_neon!(BgraImageNeon, 4, 2, 1, 0);
+
 #[target_feature(enable = "neon")]
-fn rgb_to_ycbcr_simd(
-    r: [i32; 8],
-    g: [i32; 8],
-    b: [i32; 8],
-) -> ([i32; 8], [i32; 8], [i32; 8]) {
+fn rgb_to_ycbcr_simd(r: [i32; 8], g: [i32; 8], b: [i32; 8]) -> ([i32; 8], [i32; 8], [i32; 8]) {
     // To avoid floating point math this scales everything by 2^16 which gives
     // a precision of approx 4 digits.
     //
     // Non scaled conversion:
     // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
     // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
@@ -119,6 +197,7 @@ fn rgb_to_ycbcr_simd(r: [i32; 8], g: [i32; 8], b: [i32; 8]) -> ([i32; 8], [i32; 8], [i32; 8]) {
 
     (y_out, cb_out, cr_out)
 }
 
+#[inline]
 #[target_feature(enable = "neon")]
 fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     // Safety preconditions. Optimized away in release mode, no runtime cost.
@@ -129,6 +208,7 @@ fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     unsafe { vld1q_s32(arr.as_ptr()) }
 }
 
+#[inline]
 #[target_feature(enable = "neon")]
 fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // Safety preconditions. Optimized away in release mode, no runtime cost.
@@ -136,14 +216,16 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    unsafe { vst1q_s32(arr.as_mut_ptr(), vec); }
+    unsafe {
+        vst1q_s32(arr.as_mut_ptr(), vec);
+    }
 }
 
 #[inline]
 #[target_feature(enable = "neon")]
-fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
+fn load_channel<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
     // avoid bounds checks further down
-    let values = &values[..7*STRIDE + 1];
+    let values = &values[..7 * STRIDE + 1];
 
     [
         values[0 * STRIDE] as i32,
@@ -155,4 +237,4 @@ fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
         values[6 * STRIDE] as i32,
         values[7 * STRIDE] as i32,
     ]
-}
\ No newline at end of file
+}
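The generated wrappers are pub(crate), so they are meant to be picked up inside the encoder the same way the existing AVX2 image types are; that dispatch is not part of this series. A small sanity test for the strided loader, as it might be added alongside this patch (the test module and expectations are illustrative, not from the series). Since this module is only compiled when the neon target feature is enabled at compile time, the call should not need an unsafe block:

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn load_channel_strides_through_interleaved_rgb() {
            // 8 RGB pixels; the red bytes sit at offsets 0, 3, 6, ..., 21.
            let data: Vec<u8> = (0u8..24).collect();
            let r = load_channel::<3>(&data);
            assert_eq!(r, [0, 3, 6, 9, 12, 15, 18, 21]);
        }
    }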