diff --git a/src/avx2/ycbcr.rs b/src/avx2/ycbcr.rs
index 233419f..29d9d4f 100644
--- a/src/avx2/ycbcr.rs
+++ b/src/avx2/ycbcr.rs
@@ -20,26 +20,40 @@ macro_rules! ycbcr_image_avx2 {
         impl<'a> $name<'a> {
             #[target_feature(enable = "avx2")]
-            unsafe fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
-                unsafe fn load3(data: *const u8) -> __m256i {
+            fn fill_buffers_avx2(&self, y: u16, buffers: &mut [Vec<u8>; 4]) {
+                // TODO: this compiles to many separate scalar loads and could be optimized further.
+                // But the end-to-end gains are no more than 3% and don't seem worth the complexity.
+                #[inline]
+                #[target_feature(enable = "avx2")]
+                fn load3(data: &[u8]) -> __m256i {
+                    _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
                     _mm256_set_epi32(
-                        *data as i32,
-                        *data.offset(1 * $num_colors) as i32,
-                        *data.offset(2 * $num_colors) as i32,
-                        *data.offset(3 * $num_colors) as i32,
-                        *data.offset(4 * $num_colors) as i32,
-                        *data.offset(5 * $num_colors) as i32,
-                        *data.offset(6 * $num_colors) as i32,
-                        *data.offset(7 * $num_colors) as i32,
+                        data[0] as i32,
+                        data[1 * $num_colors] as i32,
+                        data[2 * $num_colors] as i32,
+                        data[3 * $num_colors] as i32,
+                        data[4 * $num_colors] as i32,
+                        data[5 * $num_colors] as i32,
+                        data[6 * $num_colors] as i32,
+                        data[7 * $num_colors] as i32,
                     )
                 }
 
-                let mut y_buffer = buffers[0].as_mut_ptr().add(buffers[0].len());
-                buffers[0].set_len(buffers[0].len() + self.width() as usize);
-                let mut cb_buffer = buffers[1].as_mut_ptr().add(buffers[1].len());
-                buffers[1].set_len(buffers[1].len() + self.width() as usize);
-                let mut cr_buffer = buffers[2].as_mut_ptr().add(buffers[2].len());
-                buffers[2].set_len(buffers[2].len() + self.width() as usize);
+                #[inline]
+                #[target_feature(enable = "avx2")]
+                fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
+                    // Safety preconditions. Both conditions are compile-time constants, so the
+                    // checks are optimized away in release builds and have no runtime cost.
+                    assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
+                    assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
+                    // SAFETY: size and alignment preconditions checked above.
+                    // Both types are plain old data: no pointers, lifetimes, etc.
+                    unsafe { core::mem::transmute(data) }
+                }
+
+                let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
+                y_buffer.reserve(self.width() as usize);
+                cb_buffer.reserve(self.width() as usize);
+                cr_buffer.reserve(self.width() as usize);
 
                 let ymulr = _mm256_set1_epi32(19595);
                 let ymulg = _mm256_set1_epi32(38470);
@@ -53,17 +67,14 @@ macro_rules! ycbcr_image_avx2 {
                 let crmulg = _mm256_set1_epi32(27439);
                 let crmulb = _mm256_set1_epi32(5329);
 
-                let mut data = self
-                    .0
-                    .as_ptr()
-                    .offset((y as isize * self.1 as isize * $num_colors));
+                let mut data = &self.0[(y as usize * self.1 as usize * $num_colors)..];
 
                 for _ in 0..self.width() / 8 {
-                    let r = load3(data.offset($o1));
-                    let g = load3(data.offset($o2));
-                    let b = load3(data.offset($o3));
+                    let r = load3(&data[$o1..]);
+                    let g = load3(&data[$o2..]);
+                    let b = load3(&data[$o3..]);
 
-                    data = data.add($num_colors * 8);
+                    data = &data[($num_colors * 8)..];
 
                     let yr = _mm256_mullo_epi32(ymulr, r);
                     let yg = _mm256_mullo_epi32(ymulg, g);
@@ -72,7 +83,10 @@ macro_rules! ycbcr_image_avx2 {
                     let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
                     let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
                     let y = _mm256_srli_epi32(y, 16);
-                    let y: [i32; 8] = core::mem::transmute(y);
+                    let y: [i32; 8] = avx_as_i32_array(y);
+                    let mut y: [u8; 8] = y.map(|x| x as u8);
+                    y.reverse();
+                    y_buffer.extend_from_slice(&y);
 
                     let cbr = _mm256_mullo_epi32(cbmulr, r);
                     let cbg = _mm256_mullo_epi32(cbmulg, g);
@@ -82,7 +96,10 @@ macro_rules! ycbcr_image_avx2 {
                     let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
                     let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
                     let cb = _mm256_srli_epi32(cb, 16);
-                    let cb: [i32; 8] = core::mem::transmute(cb);
+                    let cb: [i32; 8] = avx_as_i32_array(cb);
+                    let mut cb: [u8; 8] = cb.map(|x| x as u8);
+                    cb.reverse();
+                    cb_buffer.extend_from_slice(&cb);
 
                     let crr = _mm256_mullo_epi32(crmulr, r);
                     let crg = _mm256_mullo_epi32(crmulg, g);
@@ -92,38 +109,19 @@ macro_rules! ycbcr_image_avx2 {
                     let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
                     let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
                     let cr = _mm256_srli_epi32(cr, 16);
-                    let cr: [i32; 8] = core::mem::transmute(cr);
-
-                    for y in y.iter().rev() {
-                        *y_buffer = *y as u8;
-                        y_buffer = y_buffer.offset(1);
-                    }
-
-                    for cb in cb.iter().rev() {
-                        *cb_buffer = *cb as u8;
-                        cb_buffer = cb_buffer.offset(1);
-                    }
-
-                    for cr in cr.iter().rev() {
-                        *cr_buffer = *cr as u8;
-                        cr_buffer = cr_buffer.offset(1);
-                    }
+                    let cr: [i32; 8] = avx_as_i32_array(cr);
+                    let mut cr: [u8; 8] = cr.map(|x| x as u8);
+                    cr.reverse();
+                    cr_buffer.extend_from_slice(&cr);
                 }
 
                 for _ in 0..self.width() % 8 {
-                    let (y, cb, cr) =
-                        rgb_to_ycbcr(*data.offset($o1), *data.offset($o2), *data.offset($o3));
-
-                    data = data.add($num_colors);
+                    let (y, cb, cr) = rgb_to_ycbcr(data[$o1], data[$o2], data[$o3]);
+                    data = &data[$num_colors..];
 
-                    *y_buffer = y;
-                    y_buffer = y_buffer.offset(1);
-
-                    *cb_buffer = cb;
-                    cb_buffer = cb_buffer.offset(1);
-
-                    *cr_buffer = cr;
-                    cr_buffer = cr_buffer.offset(1);
+                    y_buffer.push(y);
+                    cb_buffer.push(cb);
+                    cr_buffer.push(cr);
                 }
             }
         }
@@ -155,3 +153,84 @@ ycbcr_image_avx2!(RgbImageAVX2, 3, 0, 1, 2);
 ycbcr_image_avx2!(RgbaImageAVX2, 4, 0, 1, 2);
 ycbcr_image_avx2!(BgrImageAVX2, 3, 2, 1, 0);
 ycbcr_image_avx2!(BgraImageAVX2, 4, 2, 1, 0);
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::vec::Vec;
+
+    // A very basic linear congruential generator (LCG) to avoid external dependencies.
+    pub struct SimpleRng {
+        state: u64,
+    }
+
+    impl SimpleRng {
+        /// Create a new RNG with a given seed.
+        pub fn new(seed: u64) -> Self {
+            Self { state: seed }
+        }
+
+        /// Generate the next random u64 value.
+        pub fn next_u64(&mut self) -> u64 {
+            // Constants from Numerical Recipes.
+            self.state = self.state.wrapping_mul(6364136223846793005).wrapping_add(1);
+            self.state
+        }
+
+        /// Generate a random byte in 0..=255.
+        pub fn next_byte(&mut self) -> u8 {
+            (self.next_u64() & 0xFF) as u8
+        }
+
+        /// Generate a Vec<u8> of random bytes of the given length.
+        pub fn random_bytes(&mut self, len: usize) -> Vec<u8> {
+            (0..len).map(|_| self.next_byte()).collect()
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "simd")]
+    fn avx_matches_scalar_rgb() {
+        // Do not run the AVX2 test on machines that lack AVX2.
+        if !std::is_x86_feature_detected!("avx2") {
+            return;
+        }
+        let mut rng = SimpleRng::new(42);
+        let width = 512 + 3; // power of two plus a bit to stress remainder handling
+        let height = 1;
+        let bpp = 3;
+
+        let input = rng.random_bytes(width * height * bpp);
+
+        let scalar_result: Vec<[u8; 3]> = input
+            .chunks_exact(bpp)
+            .map(|chunk| {
+                let [r, g, b, ..] = chunk else { unreachable!() };
+                let (y, cb, cr) = rgb_to_ycbcr(*r, *g, *b);
+                [y, cb, cr]
+            })
+            .collect();
+
+        let mut buffers = [Vec::new(), Vec::new(), Vec::new(), Vec::new()];
+        let avx_input = RgbImageAVX2(
+            &input,
+            width.try_into().unwrap(),
+            height.try_into().unwrap(),
+        );
+        // SAFETY: we've checked above that AVX2 is present.
+        unsafe {
+            avx_input.fill_buffers_avx2(0, &mut buffers);
+        }
+
+        for i in 0..3 {
+            assert_eq!(buffers[i].len(), input.len() / 3);
+        }
+
+        for (i, pixel) in scalar_result.iter().copied().enumerate() {
+            let avx_pixel: [u8; 3] = [buffers[0][i], buffers[1][i], buffers[2][i]];
+            if pixel != avx_pixel {
+                panic!("Mismatch at index {i}: scalar result is {pixel:?}, avx result is {avx_pixel:?}");
+            }
+        }
+    }
+}
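
For reference, here is the arithmetic the vectorized loop implements, restated as a minimal scalar sketch. Only the Y multipliers (19595, 38470) and the Cr multipliers (27439, 5329) are visible in the hunks above; the remaining coefficients used below, 7471 for Y, 32768 for Cr, and the whole Cb row, are assumptions based on the standard BT.601 fixed-point table, in which each row sums to 65536. `rgb_to_ycbcr_reference` is a hypothetical name for this sketch, not the crate's `rgb_to_ycbcr`.

```rust
// Minimal scalar sketch of the fixed-point BT.601 conversion in the AVX2 path.
// Coefficients not visible in the diff (7471, 32768, and the Cb row) are
// assumed from the standard table where each row sums to 65536.
fn rgb_to_ycbcr_reference(r: u8, g: u8, b: u8) -> (u8, u8, u8) {
    let (r, g, b) = (r as i32, g as i32, b as i32);
    // Adding 0x7FFF before the >> 16 rounds to nearest, matching the AVX2 code.
    let y = (19595 * r + 38470 * g + 7471 * b + 0x7FFF) >> 16;
    // Chroma is centered on 128, hence the extra 128 << 16 bias before rounding.
    let cb = (-11059 * r - 21709 * g + 32768 * b + (128 << 16) + 0x7FFF) >> 16;
    let cr = (32768 * r - 27439 * g - 5329 * b + (128 << 16) + 0x7FFF) >> 16;
    (y as u8, cb as u8, cr as u8)
}

fn main() {
    // Pure gray should map to Y == input with neutral chroma (128, 128).
    assert_eq!(rgb_to_ycbcr_reference(90, 90, 90), (90, 128, 128));
}
```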
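The `reverse()` calls in the patch follow from `_mm256_set_epi32` taking its arguments highest lane first: `load3` passes `data[0]` as the first argument, so after the transmute it ends up at index 7. A minimal standalone check of that lane-order assumption (hypothetical `main`, assuming an x86_64 host with AVX2):

```rust
fn main() {
    #[cfg(target_arch = "x86_64")]
    {
        use core::arch::x86_64::{__m256i, _mm256_set_epi32};

        if !is_x86_feature_detected!("avx2") {
            return;
        }
        // SAFETY: AVX2 (and therefore AVX) support was just checked at runtime.
        let v: __m256i = unsafe { _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0) };
        // SAFETY: same size and alignment reasoning as `avx_as_i32_array` above.
        let lanes: [i32; 8] = unsafe { core::mem::transmute(v) };
        // The first argument landed in the highest lane, so the array reads back reversed.
        assert_eq!(lanes, [0, 1, 2, 3, 4, 5, 6, 7]);
    }
}
```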