From 92110c55e0c31e18b75f41469bf8c487d1d28357 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 16:29:14 +0000 Subject: [PATCH 01/16] Bump MSRV to 1.86 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 28abe8f..6dacbbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ categories = ["multimedia::images"] keywords = ["jpg", "jpeg", "encoder", "image"] readme = "README.md" repository = "https://github.com/vstroebel/jpeg-encoder" -rust-version = "1.61" +rust-version = "1.86" [features] default = ["std"] From 333c1010eb8a6fcb03aa30cddd6e8bba4425a54f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 16:31:01 +0000 Subject: [PATCH 02/16] AVX ycbcr: safety: always access the slice in &self with bounds checks instead of raw pointers --- src/avx2/ycbcr.rs | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 9f92294..be16943 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -21,16 +21,18 @@ macro_rules! ycbcr_image_avx2 { impl<'a> $name<'a> { #[target_feature(enable = "avx2")] unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { - unsafe fn load3(data: *const u8) -> __m256i { + #[target_feature(enable = "avx2")] + fn load3(data: &[u8]) -> __m256i { + _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later _mm256_set_epi32( - *data as i32, - *data.offset(1 * $num_colors) as i32, - *data.offset(2 * $num_colors) as i32, - *data.offset(3 * $num_colors) as i32, - *data.offset(4 * $num_colors) as i32, - *data.offset(5 * $num_colors) as i32, - *data.offset(6 * $num_colors) as i32, - *data.offset(7 * $num_colors) as i32, + data[0] as i32, + data[1 * $num_colors] as i32, + data[2 * $num_colors] as i32, + data[3 * $num_colors] as i32, + data[4 * $num_colors] as i32, + data[5 * $num_colors] as i32, + data[6 * $num_colors] as i32, + data[7 * $num_colors] as i32, ) } @@ -53,17 +55,16 @@ macro_rules! ycbcr_image_avx2 { let crmulg = _mm256_set1_epi32(27439); let crmulb = _mm256_set1_epi32(5329); - let mut data = self + let mut data = &self .0 - .as_ptr() - .offset((y as isize * self.1 as isize * $num_colors)); + [(y as usize * self.1 as usize * $num_colors)..]; for _ in 0..self.width() / 8 { - let r = load3(data.offset($o1)); - let g = load3(data.offset($o2)); - let b = load3(data.offset($o3)); + let r = load3(&data[$o1..]); + let g = load3(&data[$o2..]); + let b = load3(&data[$o3..]); - data = data.add($num_colors * 8); + data = &data[($num_colors * 8)..]; let yr = _mm256_mullo_epi32(ymulr, r); let yg = _mm256_mullo_epi32(ymulg, g); @@ -112,9 +113,9 @@ macro_rules! ycbcr_image_avx2 { for _ in 0..self.width() % 8 { let (y, cb, cr) = - rgb_to_ycbcr(*data.offset($o1), *data.offset($o2), *data.offset($o3)); + rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]); - data = data.add($num_colors); + data = &data[$num_colors..]; *y_buffer = y; y_buffer = y_buffer.offset(1); From af429c70209315531486c977adb57776f8454e47 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 16:43:50 +0000 Subject: [PATCH 03/16] Split the transmute into a helper function --- src/avx2/ycbcr.rs | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index be16943..52722ac 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -21,6 +21,8 @@ macro_rules! ycbcr_image_avx2 { impl<'a> $name<'a> { #[target_feature(enable = "avx2")] unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { + + #[inline] #[target_feature(enable = "avx2")] fn load3(data: &[u8]) -> __m256i { _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later @@ -36,6 +38,17 @@ macro_rules! ycbcr_image_avx2 { ) } + #[inline] + #[target_feature(enable = "avx2")] + fn avx_as_i32_array(data: __m256i) -> [i32; 8] { + // Safety preconditions. Optimized away in release mode, no runtime cost. + assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>()); + assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>()); + // SAFETY: size and alignment preconditions checked above. + // Both types are plain old data: no pointers, lifetimes, etc. + unsafe { core::mem::transmute(data) } + } + let mut y_buffer = buffers[0].as_mut_ptr().add(buffers[0].len()); buffers[0].set_len(buffers[0].len() + self.width() as usize); let mut cb_buffer = buffers[1].as_mut_ptr().add(buffers[1].len()); @@ -73,7 +86,7 @@ macro_rules! ycbcr_image_avx2 { let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb); let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF)); let y = _mm256_srli_epi32(y, 16); - let y: [i32; 8] = core::mem::transmute(y); + let y: [i32; 8] = avx_as_i32_array(y); let cbr = _mm256_mullo_epi32(cbmulr, r); let cbg = _mm256_mullo_epi32(cbmulg, g); @@ -83,7 +96,7 @@ macro_rules! ycbcr_image_avx2 { let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16)); let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF)); let cb = _mm256_srli_epi32(cb, 16); - let cb: [i32; 8] = core::mem::transmute(cb); + let cb: [i32; 8] = avx_as_i32_array(cb); let crr = _mm256_mullo_epi32(crmulr, r); let crg = _mm256_mullo_epi32(crmulg, g); @@ -93,7 +106,7 @@ macro_rules! ycbcr_image_avx2 { let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16)); let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF)); let cr = _mm256_srli_epi32(cr, 16); - let cr: [i32; 8] = core::mem::transmute(cr); + let cr: [i32; 8] = avx_as_i32_array(cr); for y in y.iter().rev() { *y_buffer = *y as u8; From 04a0c2774f17a65556deed63171d6049fa1c1b71 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 17:09:58 +0000 Subject: [PATCH 04/16] AVX ycbcr: safety: Always perform bounds checks on the ycbcr output buffers. Verify that there is enough capacity to hold the data before filling them to avoid buffer overflows. Treat them as MaybeUninit until they're filled to avoid exposining uninitialized memory. --- src/avx2/ycbcr.rs | 44 ++++++++++++++++++++++++++------------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 52722ac..b93a4e2 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -49,12 +49,11 @@ macro_rules! ycbcr_image_avx2 { unsafe { core::mem::transmute(data) } } - let mut y_buffer = buffers[0].as_mut_ptr().add(buffers[0].len()); - buffers[0].set_len(buffers[0].len() + self.width() as usize); - let mut cb_buffer = buffers[1].as_mut_ptr().add(buffers[1].len()); - buffers[1].set_len(buffers[1].len() + self.width() as usize); - let mut cr_buffer = buffers[2].as_mut_ptr().add(buffers[2].len()); - buffers[2].set_len(buffers[2].len() + self.width() as usize); + let [y_buffer, cb_buffer, cr_buffer, _] = buffers; + // if spare capacity is less than self.width(), slicing will fail here + let mut y_buffer = &mut y_buffer.spare_capacity_mut()[..self.width() as usize]; + let mut cb_buffer = &mut cb_buffer.spare_capacity_mut()[..self.width() as usize]; + let mut cr_buffer = &mut cr_buffer.spare_capacity_mut()[..self.width() as usize]; let ymulr = _mm256_set1_epi32(19595); let ymulg = _mm256_set1_epi32(38470); @@ -109,18 +108,18 @@ macro_rules! ycbcr_image_avx2 { let cr: [i32; 8] = avx_as_i32_array(cr); for y in y.iter().rev() { - *y_buffer = *y as u8; - y_buffer = y_buffer.offset(1); + y_buffer[0].write(*y as u8); + y_buffer = &mut y_buffer[1..]; } for cb in cb.iter().rev() { - *cb_buffer = *cb as u8; - cb_buffer = cb_buffer.offset(1); + cb_buffer[0].write(*cb as u8); + cb_buffer = &mut cb_buffer[1..]; } for cr in cr.iter().rev() { - *cr_buffer = *cr as u8; - cr_buffer = cr_buffer.offset(1); + cr_buffer[0].write(*cr as u8); + cr_buffer = &mut cr_buffer[1..]; } } @@ -130,14 +129,23 @@ macro_rules! ycbcr_image_avx2 { data = &data[$num_colors..]; - *y_buffer = y; - y_buffer = y_buffer.offset(1); + y_buffer[0].write(y); + y_buffer = &mut y_buffer[1..]; - *cb_buffer = cb; - cb_buffer = cb_buffer.offset(1); + cb_buffer[0].write(cb); + cb_buffer = &mut cb_buffer[1..]; - *cr_buffer = cr; - cr_buffer = cr_buffer.offset(1); + cr_buffer[0].write(cr); + cr_buffer = &mut cr_buffer[1..]; + } + + // SAFETY: we've just filled these with data. + // We do this at the end so that if the above code panics, and the panic is caught, + // the vectors will not have their length inflated yet and uninit memory will not be exposed + unsafe { + buffers[0].set_len(buffers[0].len() + self.width() as usize); + buffers[1].set_len(buffers[1].len() + self.width() as usize); + buffers[2].set_len(buffers[2].len() + self.width() as usize); } } } From 4a97518227068ab2ef98bb6acdc4bc383a456dd6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 17:11:44 +0000 Subject: [PATCH 05/16] AVX ycbcr: the function no longer needs to be `unsafe`, all unsafe operations are now wrapped in safe abstractions that verify preconditions --- src/avx2/ycbcr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index b93a4e2..f019510 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -20,7 +20,7 @@ macro_rules! ycbcr_image_avx2 { impl<'a> $name<'a> { #[target_feature(enable = "avx2")] - unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { + fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { #[inline] #[target_feature(enable = "avx2")] From 559000444cb1dd96553c6a5ed368d0ad138912ee Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 18:42:48 +0000 Subject: [PATCH 06/16] refactor for slightly better codegen --- src/avx2/ycbcr.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index f019510..e3165cc 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -86,6 +86,13 @@ macro_rules! ycbcr_image_avx2 { let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF)); let y = _mm256_srli_epi32(y, 16); let y: [i32; 8] = avx_as_i32_array(y); + let mut y: [u8; 8] = y.map(|x| x as u8); + y.reverse(); + + for (in_byte, out_byte) in y.iter().copied().zip(y_buffer[..8].iter_mut()) { + out_byte.write(in_byte); + } + y_buffer = &mut y_buffer[8..]; let cbr = _mm256_mullo_epi32(cbmulr, r); let cbg = _mm256_mullo_epi32(cbmulg, g); @@ -96,6 +103,13 @@ macro_rules! ycbcr_image_avx2 { let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF)); let cb = _mm256_srli_epi32(cb, 16); let cb: [i32; 8] = avx_as_i32_array(cb); + let mut cb: [u8; 8] = cb.map(|x| x as u8); + cb.reverse(); + + for (in_byte, out_byte) in cb.iter().copied().zip(cb_buffer[..8].iter_mut()) { + out_byte.write(in_byte); + } + cb_buffer = &mut cb_buffer[8..]; let crr = _mm256_mullo_epi32(crmulr, r); let crg = _mm256_mullo_epi32(crmulg, g); @@ -106,21 +120,13 @@ macro_rules! ycbcr_image_avx2 { let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF)); let cr = _mm256_srli_epi32(cr, 16); let cr: [i32; 8] = avx_as_i32_array(cr); + let mut cr: [u8; 8] = cr.map(|x| x as u8); + cr.reverse(); - for y in y.iter().rev() { - y_buffer[0].write(*y as u8); - y_buffer = &mut y_buffer[1..]; - } - - for cb in cb.iter().rev() { - cb_buffer[0].write(*cb as u8); - cb_buffer = &mut cb_buffer[1..]; - } - - for cr in cr.iter().rev() { - cr_buffer[0].write(*cr as u8); - cr_buffer = &mut cr_buffer[1..]; + for (in_byte, out_byte) in cr.iter().copied().zip(cr_buffer[..8].iter_mut()) { + out_byte.write(in_byte); } + cr_buffer = &mut cr_buffer[8..]; } for _ in 0..self.width() % 8 { From 5642f242ec33856c24c926566e12f91eb6a93f09 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 19:00:35 +0000 Subject: [PATCH 07/16] cargo fmt --- src/avx2/ycbcr.rs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index e3165cc..0fc534f 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -21,7 +21,6 @@ macro_rules! ycbcr_image_avx2 { impl<'a> $name<'a> { #[target_feature(enable = "avx2")] fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { - #[inline] #[target_feature(enable = "avx2")] fn load3(data: &[u8]) -> __m256i { @@ -67,9 +66,7 @@ macro_rules! ycbcr_image_avx2 { let crmulg = _mm256_set1_epi32(27439); let crmulb = _mm256_set1_epi32(5329); - let mut data = &self - .0 - [(y as usize * self.1 as usize * $num_colors)..]; + let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..]; for _ in 0..self.width() / 8 { let r = load3(&data[$o1..]); @@ -130,8 +127,7 @@ macro_rules! ycbcr_image_avx2 { } for _ in 0..self.width() % 8 { - let (y, cb, cr) = - rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]); + let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]); data = &data[$num_colors..]; From df6173becc120584554d8f72d43c8efaca84750a Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 19:22:20 +0000 Subject: [PATCH 08/16] expand safety comment --- src/avx2/ycbcr.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 0fc534f..c3b79a1 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -141,7 +141,10 @@ macro_rules! ycbcr_image_avx2 { cr_buffer = &mut cr_buffer[1..]; } - // SAFETY: we've just filled these with data. + // SAFETY: We know the buffers are long enough, + // otherwise `buffer.spare_capacity_mut()[..self.width()]` above would panic, + // and we've just filled these with data. + // // We do this at the end so that if the above code panics, and the panic is caught, // the vectors will not have their length inflated yet and uninit memory will not be exposed unsafe { From fc320d87ddf6ed824dd8a08ad85e93dee13c96c5 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 29 Oct 2025 20:31:55 +0000 Subject: [PATCH 09/16] Replace manual bookkeeping and unsafe{set_len()} with safe Vec methods --- src/avx2/ycbcr.rs | 49 +++++++++-------------------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index c3b79a1..8bf2b39 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -49,10 +49,9 @@ macro_rules! ycbcr_image_avx2 { } let [y_buffer, cb_buffer, cr_buffer, _] = buffers; - // if spare capacity is less than self.width(), slicing will fail here - let mut y_buffer = &mut y_buffer.spare_capacity_mut()[..self.width() as usize]; - let mut cb_buffer = &mut cb_buffer.spare_capacity_mut()[..self.width() as usize]; - let mut cr_buffer = &mut cr_buffer.spare_capacity_mut()[..self.width() as usize]; + y_buffer.reserve(self.width() as usize); + cb_buffer.reserve(self.width() as usize); + cr_buffer.reserve(self.width() as usize); let ymulr = _mm256_set1_epi32(19595); let ymulg = _mm256_set1_epi32(38470); @@ -85,11 +84,7 @@ macro_rules! ycbcr_image_avx2 { let y: [i32; 8] = avx_as_i32_array(y); let mut y: [u8; 8] = y.map(|x| x as u8); y.reverse(); - - for (in_byte, out_byte) in y.iter().copied().zip(y_buffer[..8].iter_mut()) { - out_byte.write(in_byte); - } - y_buffer = &mut y_buffer[8..]; + y_buffer.extend_from_slice(&y); let cbr = _mm256_mullo_epi32(cbmulr, r); let cbg = _mm256_mullo_epi32(cbmulg, g); @@ -102,11 +97,7 @@ macro_rules! ycbcr_image_avx2 { let cb: [i32; 8] = avx_as_i32_array(cb); let mut cb: [u8; 8] = cb.map(|x| x as u8); cb.reverse(); - - for (in_byte, out_byte) in cb.iter().copied().zip(cb_buffer[..8].iter_mut()) { - out_byte.write(in_byte); - } - cb_buffer = &mut cb_buffer[8..]; + cb_buffer.extend_from_slice(&cb); let crr = _mm256_mullo_epi32(crmulr, r); let crg = _mm256_mullo_epi32(crmulg, g); @@ -119,38 +110,16 @@ macro_rules! ycbcr_image_avx2 { let cr: [i32; 8] = avx_as_i32_array(cr); let mut cr: [u8; 8] = cr.map(|x| x as u8); cr.reverse(); - - for (in_byte, out_byte) in cr.iter().copied().zip(cr_buffer[..8].iter_mut()) { - out_byte.write(in_byte); - } - cr_buffer = &mut cr_buffer[8..]; + cr_buffer.extend_from_slice(&cr); } for _ in 0..self.width() % 8 { let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]); - data = &data[$num_colors..]; - y_buffer[0].write(y); - y_buffer = &mut y_buffer[1..]; - - cb_buffer[0].write(cb); - cb_buffer = &mut cb_buffer[1..]; - - cr_buffer[0].write(cr); - cr_buffer = &mut cr_buffer[1..]; - } - - // SAFETY: We know the buffers are long enough, - // otherwise `buffer.spare_capacity_mut()[..self.width()]` above would panic, - // and we've just filled these with data. - // - // We do this at the end so that if the above code panics, and the panic is caught, - // the vectors will not have their length inflated yet and uninit memory will not be exposed - unsafe { - buffers[0].set_len(buffers[0].len() + self.width() as usize); - buffers[1].set_len(buffers[1].len() + self.width() as usize); - buffers[2].set_len(buffers[2].len() + self.width() as usize); + y_buffer.push(y); + cb_buffer.push(cb); + cr_buffer.push(cr); } } } From f431b60cbc21e6826b774f3e734899b5aca93add Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 3 Nov 2025 12:57:38 +0000 Subject: [PATCH 10/16] Add a test verifying that AVX2 result is identical to scalar --- src/avx2/ycbcr.rs | 76 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 8bf2b39..f2b6723 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -151,3 +151,79 @@ ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2); ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2); ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0); ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0); + +#[cfg(test)] +mod tests { + use super::*; + use std::vec::Vec; + + // A very basic linear congruential generator (LCG) to avoid external dependencies. + pub struct SimpleRng { + state: u64, + } + + impl SimpleRng { + /// Create a new RNG with a given seed. + pub fn new(seed: u64) -> Self { + Self { state: seed } + } + + /// Generate the next random u64 value. + pub fn next_u64(&mut self) -> u64 { + // Constants from Numerical Recipes + self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1); + self.state + } + + /// Generate a random byte in 0..=255 + pub fn next_byte(&mut self) -> u8 { + (self.next_u64() & 0xFF) as u8 + } + + /// Fill a Vec with random bytes of the given length. + pub fn random_bytes(&mut self, len: usize) -> Vec { + (0..len).map(|_| self.next_byte()).collect() + } + } + + #[test] + #[cfg(feature = "simd")] + fn avx_matches_scalar_rgb() { + let mut rng = SimpleRng::new(42); + let width = 512 + 3; // power of two plus a bit to stress remainder handling + let height = 1; + let bpp = 3; + + let input = rng.random_bytes(width * height * bpp); // power of two plus a bit to exercise remainder handling + + let scalar_result: Vec<[u8; 3]> = input + .chunks_exact(bpp) + .map(|chunk| { + let [r, g, b, ..] = chunk else { unreachable!() }; + let (y, cb, cr) = rgb_to_ycbcr(*r, *g, *b); + [y, cb, cr] + }) + .collect(); + + let mut buffers = [Vec::new(), Vec::new(), Vec::new(), Vec::new()]; + let avx_input = RgbImageAVX2( + &input, + width.try_into().unwrap(), + height.try_into().unwrap(), + ); + unsafe { + let avx_result = avx_input.fill_buffers_avx2(0, &mut buffers); + } + + for i in 0..3 { + assert_eq!(buffers[i].len(), input.len() / 3); + } + + for (i, pixel) in scalar_result.iter().copied().enumerate() { + let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]]; + if pixel != avx_pixel { + panic!("Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}"); + } + } + } +} From fb3f9d55cfb0b50e475955629f248cd0953f522f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 3 Nov 2025 13:01:54 +0000 Subject: [PATCH 11/16] add a TODO --- src/avx2/ycbcr.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index f2b6723..4498904 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -21,6 +21,7 @@ macro_rules! ycbcr_image_avx2 { impl<'a> $name<'a> { #[target_feature(enable = "avx2")] fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { + // TODO: this compiles to many separate scalar loads and could be optimized further. #[inline] #[target_feature(enable = "avx2")] fn load3(data: &[u8]) -> __m256i { From 120c5bee0a1c5ce1b7b91649846209673dd0f445 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Mon, 3 Nov 2025 17:06:44 +0000 Subject: [PATCH 12/16] Add a note about scalar loads --- src/avx2/ycbcr.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 4498904..e2e8991 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -22,6 +22,7 @@ macro_rules! ycbcr_image_avx2 { #[target_feature(enable = "avx2")] fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec; 4]) { // TODO: this compiles to many separate scalar loads and could be optimized further. + // But the gains are no more than 3% end to end and it doesn't seem to be worth the complexity. #[inline] #[target_feature(enable = "avx2")] fn load3(data: &[u8]) -> __m256i { From 8d2d64217644210ab75afdefdcba103d2ac896ef Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 5 Nov 2025 21:41:57 +0000 Subject: [PATCH 13/16] Don't attempt to run AVX2 test on machines without AVX2 --- src/avx2/ycbcr.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index e2e8991..38cf1f9 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -191,6 +191,10 @@ mod tests { #[test] #[cfg(feature = "simd")] fn avx_matches_scalar_rgb() { + // Do not run AVX2 test on machines without it + if !std::is_x86_feature_detected!("avx2") { + return; + } let mut rng = SimpleRng::new(42); let width = 512 + 3; // power of two plus a bit to stress remainder handling let height = 1; @@ -213,6 +217,7 @@ mod tests { width.try_into().unwrap(), height.try_into().unwrap(), ); + // SAFETY: we've checked above that AVX2 is present unsafe { let avx_result = avx_input.fill_buffers_avx2(0, &mut buffers); } From c2b36b1d721455f47a19f27df07bde716c63afb7 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 5 Nov 2025 21:42:52 +0000 Subject: [PATCH 14/16] Address compiler warning --- src/avx2/ycbcr.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs index 38cf1f9..078edfa 100644 --- a/src/avx2/ycbcr.rs +++ b/src/avx2/ycbcr.rs @@ -219,7 +219,7 @@ mod tests { ); // SAFETY: we've checked above that AVX2 is present unsafe { - let avx_result = avx_input.fill_buffers_avx2(0, &mut buffers); + avx_input.fill_buffers_avx2(0, &mut buffers); } for i in 0..3 { From a32ab681dcfcc2ed7b54e67f83341fcd94521025 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Sat, 8 Nov 2025 09:37:32 +0000 Subject: [PATCH 15/16] bump MSRV --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6dacbbb..779fd84 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ categories = ["multimedia::images"] keywords = ["jpg", "jpeg", "encoder", "image"] readme = "README.md" repository = "https://github.com/vstroebel/jpeg-encoder" -rust-version = "1.86" +rust-version = "1.87" [features] default = ["std"] From bf3ab88e7d50486a5f8b429b0fcd6b2654b98d8d Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 12 Nov 2025 23:11:37 +0000 Subject: [PATCH 16/16] bump github workflow version to match MSRV --- .github/workflows/rust.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 892c75d..50c1f55 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: - rust: ["1.61", stable, nightly] + rust: ["1.87", stable, nightly] steps: - name: Installing Rust toolchain @@ -36,7 +36,7 @@ jobs: strategy: matrix: - rust: [ "1.61", stable, nightly ] + rust: [ "1.87", stable, nightly ] steps: - name: Installing Rust toolchain