From d34c3e718a7903826065750d5306bcc8d36c475b Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:21:43 +0000
Subject: [PATCH 1/7] WIP: initial translation of vectorized RGB->YCbCr from
 std::simd to NEON intrinsics

---
 src/lib.rs        |  2 ++
 src/neon.rs       |  3 +++
 src/neon/ycbcr.rs | 57 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)
 create mode 100644 src/neon.rs
 create mode 100644 src/neon/ycbcr.rs

diff --git a/src/lib.rs b/src/lib.rs
index 9fc0a0e..20679be 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -33,6 +33,8 @@ extern crate core;
 
 #[cfg(all(feature = "simd", any(target_arch = "x86", target_arch = "x86_64")))]
 mod avx2;
+#[cfg(all(feature = "simd", target_feature = "neon"))]
+mod neon;
 mod encoder;
 mod error;
 mod fdct;
diff --git a/src/neon.rs b/src/neon.rs
new file mode 100644
index 0000000..43b5e68
--- /dev/null
+++ b/src/neon.rs
@@ -0,0 +1,3 @@
+mod ycbcr;
+
+pub(crate) use ycbcr::*;
diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
new file mode 100644
index 0000000..ab3fc51
--- /dev/null
+++ b/src/neon/ycbcr.rs
@@ -0,0 +1,57 @@
+#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::*;
+
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[unsafe(no_mangle)]
+#[inline(never)]
+unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x4_t, int32x4_t, int32x4_t) {
+    // To avoid floating point math this scales everything by 2^16 which gives
+    // a precision of approx 4 digits.
+    //
+    // Non scaled conversion:
+    // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
+    // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
+    // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
+
+    let y1_mul = vdupq_n_s32(19595);
+    let y2_mul = vdupq_n_s32(38470);
+    let y3_mul = vdupq_n_s32(7471);
+
+    let cb1_mul = vdupq_n_s32(-11059);
+    let cb2_mul = vdupq_n_s32(21709);
+    let cb3_mul = vdupq_n_s32(32768);
+    let cb4_mul = vdupq_n_s32(128 << 16);
+
+    let cr1_mul = vdupq_n_s32(32768);
+    let cr2_mul = vdupq_n_s32(27439);
+    let cr3_mul = vdupq_n_s32(5329);
+    let cr4_mul = vdupq_n_s32(128 << 16);
+
+    // Y = y1_mul * r + y2_mul * g + y3_mul * b
+    let y = vmlaq_s32(vmlaq_s32(vmulq_s32(y1_mul, r), y2_mul, g), y3_mul, b);
+
+    // Cb = cb1_mul * r - cb2_mul * g + cb3_mul * b + cb4_mul
+    let cb = vaddq_s32(
+        vmlaq_s32(vmlsq_s32(vmulq_s32(cb1_mul, r), cb2_mul, g), cb3_mul, b),
+        cb4_mul
+    );
+
+    // Cr = cr1_mul * r - cr2_mul * g - cr3_mul * b + cr4_mul
+    let cr = vaddq_s32(
+        vmlsq_s32(vmlsq_s32(vmulq_s32(cr1_mul, r), cr2_mul, g), cr3_mul, b),
+        cr4_mul
+    );
+
+    #[inline(always)]
+    unsafe fn round_shift(v: int32x4_t) -> int32x4_t {
+        let round = vdupq_n_s32(0x7FFF);
+        vshrq_n_s32(vaddq_s32(v, round), 16)
+    }
+
+    (
+        round_shift(y),
+        round_shift(cb),
+        round_shift(cr)
+    )
+}
\ No newline at end of file
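For reference, each fixed-point multiplier above is just the corresponding weight from the comment scaled by 2^16 and rounded; a quick illustrative check (not part of the series) that the magic numbers match the stated weights:

    #[test]
    fn fixed_point_constants_match_weights() {
        let scaled = |w: f64| (w * 65536.0).round() as i32;
        assert_eq!(scaled(0.29900), 19595);
        assert_eq!(scaled(0.58700), 38470);
        assert_eq!(scaled(0.11400), 7471);
        assert_eq!(scaled(0.16874), 11059); // stored negated for Cb
        assert_eq!(scaled(0.33126), 21709); // subtracted via vmlsq_s32
        assert_eq!(scaled(0.50000), 32768);
        assert_eq!(scaled(0.41869), 27439); // subtracted via vmlsq_s32
        assert_eq!(scaled(0.08131), 5329);  // subtracted via vmlsq_s32
    }

The `+ 0x7FFF` added before the 16-bit right shift in round_shift makes the scaled result round to the nearest integer instead of always rounding down.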
From ea673199d987bc4e1a4e96c9c1951384b157408d Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:24:46 +0000
Subject: [PATCH 2/7] Widen the NEON function to take advantage of
 instruction-level parallelism and wide issue on recent Aarch64

---
 src/neon/ycbcr.rs | 97 +++++++++++++++++++++++++++++++++++++----------
 1 file changed, 78 insertions(+), 19 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index ab3fc51..c4eba39 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -5,7 +5,11 @@ use std::arch::aarch64::*;
 #[target_feature(enable = "neon")]
 #[unsafe(no_mangle)]
 #[inline(never)]
-unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x4_t, int32x4_t, int32x4_t) {
+unsafe fn rgb_to_ycbcr_simd(
+    r: [i32; 8],
+    g: [i32; 8],
+    b: [i32; 8],
+) -> ([i32; 8], [i32; 8], [i32; 8]) {
     // To avoid floating point math this scales everything by 2^16 which gives
     // a precision of approx 4 digits.
     //
     // Non scaled conversion:
     // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
@@ -14,6 +18,14 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
     // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
+    // Load input arrays into NEON registers (2 registers per channel)
+    let r_lo = vld1q_s32(r.as_ptr());
+    let r_hi = vld1q_s32(r.as_ptr().add(4));
+    let g_lo = vld1q_s32(g.as_ptr());
+    let g_hi = vld1q_s32(g.as_ptr().add(4));
+    let b_lo = vld1q_s32(b.as_ptr());
+    let b_hi = vld1q_s32(b.as_ptr().add(4));
+
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
     let y3_mul = vdupq_n_s32(7471);
@@ -28,19 +40,50 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
     let cr3_mul = vdupq_n_s32(5329);
     let cr4_mul = vdupq_n_s32(128 << 16);
 
-    // Y = y1_mul * r + y2_mul * g + y3_mul * b
-    let y = vmlaq_s32(vmlaq_s32(vmulq_s32(y1_mul, r), y2_mul, g), y3_mul, b);
-
-    // Cb = cb1_mul * r - cb2_mul * g + cb3_mul * b + cb4_mul
-    let cb = vaddq_s32(
-        vmlaq_s32(vmlsq_s32(vmulq_s32(cb1_mul, r), cb2_mul, g), cb3_mul, b),
-        cb4_mul
+    // Process low 4 elements
+    let y_lo = vmlaq_s32(
+        vmlaq_s32(vmulq_s32(y1_mul, r_lo), y2_mul, g_lo),
+        y3_mul,
+        b_lo,
+    );
+    let cb_lo = vaddq_s32(
+        vmlaq_s32(
+            vmlsq_s32(vmulq_s32(cb1_mul, r_lo), cb2_mul, g_lo),
+            cb3_mul,
+            b_lo,
+        ),
+        cb4_mul,
     );
-
-    // Cr = cr1_mul * r - cr2_mul * g - cr3_mul * b + cr4_mul
-    let cr = vaddq_s32(
-        vmlsq_s32(vmlsq_s32(vmulq_s32(cr1_mul, r), cr2_mul, g), cr3_mul, b),
-        cr4_mul
+    let cr_lo = vaddq_s32(
+        vmlsq_s32(
+            vmlsq_s32(vmulq_s32(cr1_mul, r_lo), cr2_mul, g_lo),
+            cr3_mul,
+            b_lo,
+        ),
+        cr4_mul,
+    );
+
+    // Process high 4 elements
+    let y_hi = vmlaq_s32(
+        vmlaq_s32(vmulq_s32(y1_mul, r_hi), y2_mul, g_hi),
+        y3_mul,
+        b_hi,
+    );
+    let cb_hi = vaddq_s32(
+        vmlaq_s32(
+            vmlsq_s32(vmulq_s32(cb1_mul, r_hi), cb2_mul, g_hi),
+            cb3_mul,
+            b_hi,
+        ),
+        cb4_mul,
+    );
+    let cr_hi = vaddq_s32(
+        vmlsq_s32(
+            vmlsq_s32(vmulq_s32(cr1_mul, r_hi), cr2_mul, g_hi),
+            cr3_mul,
+            b_hi,
+        ),
+        cr4_mul,
     );
 
     #[inline(always)]
@@ -49,9 +92,25 @@ unsafe fn rgb_to_ycbcr_simd(r: int32x4_t, g: int32x4_t, b: int32x4_t) -> (int32x
         vshrq_n_s32(vaddq_s32(v, round), 16)
     }
 
-    (
-        round_shift(y),
-        round_shift(cb),
-        round_shift(cr)
-    )
-}
\ No newline at end of file
+    // Round and shift
+    let y_lo = round_shift(y_lo);
+    let y_hi = round_shift(y_hi);
+    let cb_lo = round_shift(cb_lo);
+    let cb_hi = round_shift(cb_hi);
+    let cr_lo = round_shift(cr_lo);
+    let cr_hi = round_shift(cr_hi);
+
+    // Store results back to arrays
+    let mut y_out = [0i32; 8];
+    let mut cb_out = [0i32; 8];
+    let mut cr_out = [0i32; 8];
+
+    vst1q_s32(y_out.as_mut_ptr(), y_lo);
+    vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
+    vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
+    vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
+    vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
+    vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+
+    (y_out, cb_out, cr_out)
+}
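The widened version keeps two independent int32x4_t dependency chains per channel, which is what lets a wide out-of-order AArch64 core overlap the multiply-accumulate latencies. An illustrative lane-by-lane check against the scalar formula (the test is not part of the series):

    #[cfg(all(test, target_arch = "aarch64"))]
    #[test]
    fn widened_kernel_matches_scalar_formula() {
        let r = [0, 255, 12, 99, 180, 33, 250, 1];
        let g = [255, 0, 34, 56, 78, 90, 120, 200];
        let b = [7, 13, 255, 0, 66, 128, 3, 77];
        // The kernel is still an `unsafe fn` at this point in the series.
        let (y, cb, cr) = unsafe { rgb_to_ycbcr_simd(r, g, b) };
        let round = |v: i32| (v + 0x7FFF) >> 16;
        for i in 0..8 {
            assert_eq!(y[i], round(19595 * r[i] + 38470 * g[i] + 7471 * b[i]));
            assert_eq!(cb[i], round(-11059 * r[i] - 21709 * g[i] + 32768 * b[i] + (128 << 16)));
            assert_eq!(cr[i], round(32768 * r[i] - 27439 * g[i] - 5329 * b[i] + (128 << 16)));
        }
    }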
From febf93aa5c6075ea39395e9667d61b97b7a83dd0 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 11:44:04 +0000
Subject: [PATCH 3/7] Add safe wrappers for loads/stores, general safety pass

---
 src/neon/ycbcr.rs | 45 +++++++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index c4eba39..b3554ec 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -1,11 +1,11 @@
+#[cfg(target_arch = "arm")] // 32-bit ARM with NEON
+use std::arch::arm::*;
+
 #[cfg(target_arch = "aarch64")]
 use std::arch::aarch64::*;
 
-#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[unsafe(no_mangle)]
-#[inline(never)]
-unsafe fn rgb_to_ycbcr_simd(
+fn rgb_to_ycbcr_simd(
     r: [i32; 8],
     g: [i32; 8],
     b: [i32; 8],
@@ -19,12 +19,12 @@ unsafe fn rgb_to_ycbcr_simd(
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
     // Load input arrays into NEON registers (2 registers per channel)
-    let r_lo = vld1q_s32(r.as_ptr());
-    let r_hi = vld1q_s32(r.as_ptr().add(4));
-    let g_lo = vld1q_s32(g.as_ptr());
-    let g_hi = vld1q_s32(g.as_ptr().add(4));
-    let b_lo = vld1q_s32(b.as_ptr());
-    let b_hi = vld1q_s32(b.as_ptr().add(4));
+    let r_lo = load_i32x4(r[..4]);
+    let r_hi = load_i32x4(r[4..]);
+    let g_lo = load_i32x4(g[..4]);
+    let g_hi = load_i32x4(g[4..]);
+    let b_lo = load_i32x4(b[..4]);
+    let b_hi = load_i32x4(b[4..]);
 
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
@@ -86,8 +86,9 @@ unsafe fn rgb_to_ycbcr_simd(
         cr4_mul,
     );
 
-    #[inline(always)]
-    unsafe fn round_shift(v: int32x4_t) -> int32x4_t {
+    #[target_feature(enable = "neon")]
+    #[inline]
+    fn round_shift(v: int32x4_t) -> int32x4_t {
         let round = vdupq_n_s32(0x7FFF);
         vshrq_n_s32(vaddq_s32(v, round), 16)
     }
@@ -114,3 +115,23 @@ unsafe fn rgb_to_ycbcr_simd(
 
     (y_out, cb_out, cr_out)
 }
+
+#[target_feature(enable = "neon")]
+fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
+    // Safety preconditions. Optimized away in release mode, no runtime cost.
+    assert!(core::mem::size_of::<int32x4_t>() == core::mem::size_of::<[i32; 4]>());
+    // SAFETY: size checked above.
+    // NEON load intrinsics do not care if data is aligned.
+    // Both types are plain old data: no pointers, lifetimes, etc.
+    vld1q_s32(arr.as_ptr())
+}
+
+#[target_feature(enable = "neon")]
+fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
+    // Safety preconditions. Optimized away in release mode, no runtime cost.
+    assert!(arr.len() * core::mem::size_of::<i32>() >= core::mem::size_of::<int32x4_t>());
+    // SAFETY: size checked above.
+    // NEON load intrinsics do not care if data is aligned.
+    // Both types are plain old data: no pointers, lifetimes, etc.
+    vst1q_s32(arr.as_mut_ptr(), vec);
+}
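The store_i32x4 wrapper added here is not called yet; the raw vst1q_s32 stores remain until later in the series. As a sketch (assuming the wrapper keeps its &mut [i32] signature), the tail of rgb_to_ycbcr_simd could go through it like this:

    let mut y_out = [0i32; 8];
    store_i32x4(&mut y_out[..4], y_lo);
    store_i32x4(&mut y_out[4..], y_hi);
    // ...and the same for cb_out and cr_out.

Because the caller and the wrapper are both #[target_feature(enable = "neon")], these calls need no unsafe block.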
From b45225aeca4db5a76c023c2ec009d50bb504cc1a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:03:04 +0000
Subject: [PATCH 4/7] Add a function to load u8 values to an i32 array with
 widening

---
 src/neon/ycbcr.rs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index b3554ec..df1086f 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -135,3 +135,21 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // Both types are plain old data: no pointers, lifetimes, etc.
     vst1q_s32(arr.as_mut_ptr(), vec);
 }
+
+#[inline]
+#[target_feature(enable = "neon")]
+fn load_u8_to_i32<const stride: usize>(values: &[u8]) -> [i32; 8] {
+    // avoid bounds checks further down
+    let values = &values[..7*stride + 1];
+
+    [
+        values[0 * stride] as i32,
+        values[1 * stride] as i32,
+        values[2 * stride] as i32,
+        values[3 * stride] as i32,
+        values[4 * stride] as i32,
+        values[5 * stride] as i32,
+        values[6 * stride] as i32,
+        values[7 * stride] as i32,
+    ]
+}
\ No newline at end of file

From 59d76b1a9c9ec2221896aced8d3a9209a2660693 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:04:07 +0000
Subject: [PATCH 5/7] Address compiler warning

---
 src/neon/ycbcr.rs | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index df1086f..d09abbd 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -138,18 +138,18 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
 
 #[inline]
 #[target_feature(enable = "neon")]
-fn load_u8_to_i32<const stride: usize>(values: &[u8]) -> [i32; 8] {
+fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
     // avoid bounds checks further down
-    let values = &values[..7*stride + 1];
+    let values = &values[..7*STRIDE + 1];
 
     [
-        values[0 * stride] as i32,
-        values[1 * stride] as i32,
-        values[2 * stride] as i32,
-        values[3 * stride] as i32,
-        values[4 * stride] as i32,
-        values[5 * stride] as i32,
-        values[6 * stride] as i32,
-        values[7 * stride] as i32,
+        values[0 * STRIDE] as i32,
+        values[1 * STRIDE] as i32,
+        values[2 * STRIDE] as i32,
+        values[3 * STRIDE] as i32,
+        values[4 * STRIDE] as i32,
+        values[5 * STRIDE] as i32,
+        values[6 * STRIDE] as i32,
+        values[7 * STRIDE] as i32,
     ]
 }
\ No newline at end of file
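load_u8_to_i32 widens 8 bytes taken STRIDE apart, so one call per channel de-interleaves a row of packed pixels. Intended use on interleaved RGB data, as a sketch (the row contents are placeholders):

    let row = [0u8; 3 * 8]; // 8 interleaved RGB pixels
    let r = load_u8_to_i32::<3>(&row[0..]);
    let g = load_u8_to_i32::<3>(&row[1..]);
    let b = load_u8_to_i32::<3>(&row[2..]);

Even for the last channel the slice still holds the 7 * STRIDE + 1 = 22 bytes that the bounds-check prologue expects.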
From 010ef9bbfb10bd7ad96ffa77d63122fc29434d39 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:09:47 +0000
Subject: [PATCH 6/7] Fix build failures

---
 src/neon/ycbcr.rs | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index d09abbd..1b58657 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -19,12 +19,12 @@ fn rgb_to_ycbcr_simd(
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
 
     // Load input arrays into NEON registers (2 registers per channel)
-    let r_lo = load_i32x4(r[..4]);
-    let r_hi = load_i32x4(r[4..]);
-    let g_lo = load_i32x4(g[..4]);
-    let g_hi = load_i32x4(g[4..]);
-    let b_lo = load_i32x4(b[..4]);
-    let b_hi = load_i32x4(b[4..]);
+    let r_lo = load_i32x4(r[..4].try_into().unwrap());
+    let r_hi = load_i32x4(r[4..].try_into().unwrap());
+    let g_lo = load_i32x4(g[..4].try_into().unwrap());
+    let g_hi = load_i32x4(g[4..].try_into().unwrap());
+    let b_lo = load_i32x4(b[..4].try_into().unwrap());
+    let b_hi = load_i32x4(b[4..].try_into().unwrap());
 
     let y1_mul = vdupq_n_s32(19595);
     let y2_mul = vdupq_n_s32(38470);
@@ -106,12 +106,15 @@ fn rgb_to_ycbcr_simd(
     let mut cb_out = [0i32; 8];
     let mut cr_out = [0i32; 8];
 
-    vst1q_s32(y_out.as_mut_ptr(), y_lo);
-    vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
-    vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
-    vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
-    vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
-    vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+    // TODO: refactor into safe stores
+    unsafe {
+        vst1q_s32(y_out.as_mut_ptr(), y_lo);
+        vst1q_s32(y_out.as_mut_ptr().add(4), y_hi);
+        vst1q_s32(cb_out.as_mut_ptr(), cb_lo);
+        vst1q_s32(cb_out.as_mut_ptr().add(4), cb_hi);
+        vst1q_s32(cr_out.as_mut_ptr(), cr_lo);
+        vst1q_s32(cr_out.as_mut_ptr().add(4), cr_hi);
+    }
 
     (y_out, cb_out, cr_out)
 }
@@ -123,7 +126,7 @@ fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    vld1q_s32(arr.as_ptr())
+    unsafe { vld1q_s32(arr.as_ptr()) }
 }
 
 #[target_feature(enable = "neon")]
@@ -133,7 +136,7 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    vst1q_s32(arr.as_mut_ptr(), vec);
+    unsafe { vst1q_s32(arr.as_mut_ptr(), vec); }
 }
 
 #[inline]
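The try_into().unwrap() calls introduced above only turn a 4-element slice of an [i32; 8] into the &[i32; 4] reference that load_i32x4 expects; spelled out (illustrative), the conversion cannot fail because the slice length is fixed:

    let r = [0i32; 8];
    let lo: &[i32; 4] = r[..4].try_into().unwrap(); // length is exactly 4
    let hi: &[i32; 4] = r[4..].try_into().unwrap(); // length is exactly 4

The new unsafe blocks are only needed around vld1q_s32 and vst1q_s32, which dereference raw pointers; the arithmetic intrinsics remain callable without unsafe inside #[target_feature(enable = "neon")] functions.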
From 1b3631d0b5efd5b3c3c6efdda78a973536e2ee84 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff"
Date: Mon, 17 Nov 2025 12:35:40 +0000
Subject: [PATCH 7/7] Wire up all the loose functions to a struct

---
 src/neon/ycbcr.rs | 100 +++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 91 insertions(+), 9 deletions(-)

diff --git a/src/neon/ycbcr.rs b/src/neon/ycbcr.rs
index 1b58657..bd849d2 100644
--- a/src/neon/ycbcr.rs
+++ b/src/neon/ycbcr.rs
@@ -4,12 +4,90 @@ use std::arch::arm::*;
 #[cfg(target_arch = "aarch64")]
 use std::arch::aarch64::*;
 
+use alloc::vec::Vec;
+
+use crate::{rgb_to_ycbcr, ImageBuffer, JpegColorType};
+
+macro_rules! ycbcr_image_neon {
+    ($name:ident, $num_colors:expr, $o1:expr, $o2:expr, $o3:expr) => {
+        pub(crate) struct $name<'a>(pub &'a [u8], pub u16, pub u16);
+
+        impl<'a> $name<'a> {
+            #[target_feature(enable = "neon")]
+            fn fill_buffers_neon(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
+                #[inline]
+                #[target_feature(enable = "neon")]
+                // The stride is the number of interleaved color channels per pixel.
+                fn load3(data: &[u8]) -> [i32; 8] {
+                    load_channel::<$num_colors>(data)
+                }
+
+                let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
+                y_buffer.reserve(self.width() as usize);
+                cb_buffer.reserve(self.width() as usize);
+                cr_buffer.reserve(self.width() as usize);
+
+                let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..];
+
+                for _ in 0..self.width() / 8 {
+                    let r = load3(&data[$o1..]);
+                    let g = load3(&data[$o2..]);
+                    let b = load3(&data[$o3..]);
+
+                    data = &data[($num_colors * 8)..];
+
+                    let (y, cb, cr) = rgb_to_ycbcr_simd(r, g, b);
+
+                    let y: [u8; 8] = y.map(|x| x as u8);
+                    y_buffer.extend_from_slice(&y);
+
+                    let cb: [u8; 8] = cb.map(|x| x as u8);
+                    cb_buffer.extend_from_slice(&cb);
+
+                    let cr: [u8; 8] = cr.map(|x| x as u8);
+                    cr_buffer.extend_from_slice(&cr);
+                }
+
+                for _ in 0..self.width() % 8 {
+                    let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]);
+                    data = &data[$num_colors..];
+
+                    y_buffer.push(y);
+                    cb_buffer.push(cb);
+                    cr_buffer.push(cr);
+                }
+            }
+        }
+
+        impl<'a> ImageBuffer for $name<'a> {
+            fn get_jpeg_color_type(&self) -> JpegColorType {
+                JpegColorType::Ycbcr
+            }
+
+            fn width(&self) -> u16 {
+                self.1
+            }
+
+            fn height(&self) -> u16 {
+                self.2
+            }
+
+            #[inline(always)]
+            fn fill_buffers(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
+                unsafe {
+                    self.fill_buffers_neon(y, buffers);
+                }
+            }
+        }
+    };
+}
+
+ycbcr_image_neon!(RgbImageNeon, 3, 0, 1, 2);
+ycbcr_image_neon!(RgbaImageNeon, 4, 0, 1, 2);
+ycbcr_image_neon!(BgrImageNeon, 3, 2, 1, 0);
+ycbcr_image_neon!(BgraImageNeon, 4, 2, 1, 0);
+
 #[target_feature(enable = "neon")]
-fn rgb_to_ycbcr_simd(
-    r: [i32; 8],
-    g: [i32; 8],
-    b: [i32; 8],
-) -> ([i32; 8], [i32; 8], [i32; 8]) {
+fn rgb_to_ycbcr_simd(r: [i32; 8], g: [i32; 8], b: [i32; 8]) -> ([i32; 8], [i32; 8], [i32; 8]) {
     // To avoid floating point math this scales everything by 2^16 which gives
     // a precision of approx 4 digits.
     //
     // Non scaled conversion:
     // Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
     // Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
     // Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
@@ -119,6 +197,7 @@ fn rgb_to_ycbcr_simd(r: [i32; 8], g: [i32; 8], b: [i32; 8]) -> ([i32; 8], [i32; 8], [i32; 8]) {
 
     (y_out, cb_out, cr_out)
 }
 
+#[inline]
 #[target_feature(enable = "neon")]
 fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     // Safety preconditions. Optimized away in release mode, no runtime cost.
@@ -129,6 +208,7 @@ fn load_i32x4(arr: &[i32; 4]) -> int32x4_t {
     unsafe { vld1q_s32(arr.as_ptr()) }
 }
 
+#[inline]
 #[target_feature(enable = "neon")]
 fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // Safety preconditions. Optimized away in release mode, no runtime cost.
@@ -136,14 +216,16 @@ fn store_i32x4(arr: &mut [i32], vec: int32x4_t) {
     // SAFETY: size checked above.
     // NEON load intrinsics do not care if data is aligned.
     // Both types are plain old data: no pointers, lifetimes, etc.
-    unsafe { vst1q_s32(arr.as_mut_ptr(), vec); }
+    unsafe {
+        vst1q_s32(arr.as_mut_ptr(), vec);
+    }
 }
 
 #[inline]
 #[target_feature(enable = "neon")]
-fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
+fn load_channel<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
     // avoid bounds checks further down
-    let values = &values[..7*STRIDE + 1];
+    let values = &values[..7 * STRIDE + 1];
 
     [
         values[0 * STRIDE] as i32,
@@ -155,4 +237,4 @@ fn load_u8_to_i32<const STRIDE: usize>(values: &[u8]) -> [i32; 8] {
         values[6 * STRIDE] as i32,
         values[7 * STRIDE] as i32,
     ]
-}
\ No newline at end of file
+}
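The generated wrappers are pub(crate), so they are meant to be picked up inside the encoder the same way the existing AVX2 image types are; that dispatch is not part of this series. A small sanity test for the strided loader, as it might be added alongside this patch (the test module and expectations are illustrative, not from the series). Since this module is only compiled when the neon target feature is enabled at compile time, the call should not need an unsafe block:

    #[cfg(test)]
    mod tests {
        use super::*;

        #[test]
        fn load_channel_strides_through_interleaved_rgb() {
            // 8 RGB pixels; the red bytes sit at offsets 0, 3, 6, ..., 21.
            let data: Vec<u8> = (0u8..24).collect();
            let r = load_channel::<3>(&data);
            assert_eq!(r, [0, 3, 6, 9, 12, 15, 18, 21]);
        }
    }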