
Commit ae27b16

Explicitly vectorize i32x8 to u8x8 conversion for storing into YCbCr
Remove the unsafe transmute in the AVX2 YCbCr conversion. Prior to this, the conversion was scalarized and used a bswap. Rewriting the code to merely avoid reversing the array resulted in worse codegen that extracted the bytes and manually re-inserted them back into a SIMD register to store 8 bytes at once, so the i32x8 to u8x8 conversion is now written with explicit byte shuffles instead.
Parent commit: 120c5be
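For context on the "reversing the array" the message refers to: the old `load3` built its vector with `_mm256_set_epi32`, which takes its arguments highest lane first, so the eight results came back in reverse pixel order and had to be `.reverse()`d before being written out. The sketch below is not part of this commit; `lane_order` and its harness are made up purely to illustrate the difference between `_mm256_set_epi32` and the `_mm256_setr_epi32` that the diff switches to:

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m256i, _mm256_set_epi32, _mm256_setr_epi32, _mm256_storeu_si256};

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn lane_order() -> ([i32; 8], [i32; 8]) {
    // `set` takes its arguments highest-lane first, `setr` takes them in memory order.
    let a = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    let (mut out_a, mut out_b) = ([0i32; 8], [0i32; 8]);
    _mm256_storeu_si256(out_a.as_mut_ptr() as *mut __m256i, a);
    _mm256_storeu_si256(out_b.as_mut_ptr() as *mut __m256i, b);
    (out_a, out_b)
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if is_x86_feature_detected!("avx2") {
        let (a, b) = unsafe { lane_order() };
        assert_eq!(a, [7, 6, 5, 4, 3, 2, 1, 0]); // reversed relative to the arguments
        assert_eq!(b, [0, 1, 2, 3, 4, 5, 6, 7]); // memory order matches argument order
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```

With `setr`, lane i already corresponds to pixel i, so the explicit byte shuffle in this commit can emit the 8 output bytes directly in store order.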


src/avx2/ycbcr.rs

Lines changed: 29 additions & 27 deletions
```diff
@@ -1,13 +1,15 @@
 #[cfg(target_arch = "x86")]
 use core::arch::x86::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_setr_epi8,
+    _mm256_shuffle_epi8, _mm256_sub_epi32, _mm_cvtsi128_si64,
 };
 
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::{
-    __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
-    _mm256_srli_epi32, _mm256_sub_epi32,
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
+    _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_setr_epi8,
+    _mm256_shuffle_epi8, _mm256_sub_epi32, _mm_cvtsi128_si64,
 };
 
 use alloc::vec::Vec;
@@ -27,7 +29,7 @@ macro_rules! ycbcr_image_avx2 {
         #[target_feature(enable = "avx2")]
         fn load3(data: &[u8]) -> __m256i {
             _ = data[7 * $num_colors]; // dummy indexing operation up front to avoid bounds checks later
-            _mm256_set_epi32(
+            _mm256_setr_epi32(
                 data[0] as i32,
                 data[1 * $num_colors] as i32,
                 data[2 * $num_colors] as i32,
@@ -41,13 +43,25 @@ macro_rules! ycbcr_image_avx2 {
 
         #[inline]
         #[target_feature(enable = "avx2")]
-        fn avx_as_i32_array(data: __m256i) -> [i32; 8] {
-            // Safety preconditions. Optimized away in release mode, no runtime cost.
-            assert!(core::mem::size_of::<__m256i>() == core::mem::size_of::<[i32; 8]>());
-            assert!(core::mem::align_of::<__m256i>() >= core::mem::align_of::<[i32; 8]>());
-            // SAFETY: size and alignment preconditions checked above.
-            // Both types are plain old data: no pointers, lifetimes, etc.
-            unsafe { core::mem::transmute(data) }
+        fn avx_as_u8_array(data: __m256i) -> [u8; 8] {
+            // Select the third byte of every i32, pack the bytes into
+            // the low i32 byte of each 128-bit lane
+            let u8_shuffle_mask = _mm256_setr_epi8(
+                2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 2, 6, 10, 14,
+                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            );
+            // Shuffle the lowest i32 lane from the high 128-bit lane
+            // into the second i32
+            let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
+
+            // ABCD EFGH IJKL MNOP becomes CGKO .... .... ...., for both lanes
+            let a = _mm256_shuffle_epi8(data, u8_shuffle_mask);
+            // A... B... becomes AB.. ....
+            let a = _mm256_permutevar8x32_epi32(a, u32_permute_mask);
+
+            // No-op cast to __m128i and extract the lower 64-bits
+            let out = _mm256_castsi256_si128(a);
+            _mm_cvtsi128_si64(out).to_le_bytes()
         }
 
         let [y_buffer, cb_buffer, cr_buffer, _] = buffers;
@@ -82,11 +96,7 @@ macro_rules! ycbcr_image_avx2 {
 
             let y = _mm256_add_epi32(_mm256_add_epi32(yr, yg), yb);
             let y = _mm256_add_epi32(y, _mm256_set1_epi32(0x7FFF));
-            let y = _mm256_srli_epi32(y, 16);
-            let y: [i32; 8] = avx_as_i32_array(y);
-            let mut y: [u8; 8] = y.map(|x| x as u8);
-            y.reverse();
-            y_buffer.extend_from_slice(&y);
+            y_buffer.extend(avx_as_u8_array(y));
 
             let cbr = _mm256_mullo_epi32(cbmulr, r);
             let cbg = _mm256_mullo_epi32(cbmulg, g);
@@ -95,11 +105,7 @@ macro_rules! ycbcr_image_avx2 {
             let cb = _mm256_add_epi32(_mm256_sub_epi32(cbr, cbg), cbb);
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(128 << 16));
             let cb = _mm256_add_epi32(cb, _mm256_set1_epi32(0x7FFF));
-            let cb = _mm256_srli_epi32(cb, 16);
-            let cb: [i32; 8] = avx_as_i32_array(cb);
-            let mut cb: [u8; 8] = cb.map(|x| x as u8);
-            cb.reverse();
-            cb_buffer.extend_from_slice(&cb);
+            cb_buffer.extend(avx_as_u8_array(cb));
 
             let crr = _mm256_mullo_epi32(crmulr, r);
             let crg = _mm256_mullo_epi32(crmulg, g);
@@ -108,11 +114,7 @@ macro_rules! ycbcr_image_avx2 {
             let cr = _mm256_sub_epi32(_mm256_sub_epi32(crr, crg), crb);
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(128 << 16));
             let cr = _mm256_add_epi32(cr, _mm256_set1_epi32(0x7FFF));
-            let cr = _mm256_srli_epi32(cr, 16);
-            let cr: [i32; 8] = avx_as_i32_array(cr);
-            let mut cr: [u8; 8] = cr.map(|x| x as u8);
-            cr.reverse();
-            cr_buffer.extend_from_slice(&cr);
+            cr_buffer.extend(avx_as_u8_array(cr));
         }
 
         for _ in 0..self.width() % 8 {
```
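The key observation behind the new avx_as_u8_array is that byte 2 of each little-endian i32 lane already holds bits 16..=23, so selecting it with _mm256_shuffle_epi8 subsumes the old _mm256_srli_epi32(x, 16) plus truncate-to-u8 step. Below is a standalone sketch simplified from the diff (pack_third_bytes and the main harness are illustrative, not the crate's API) that checks the shuffle-based pack against the old scalar path:

```rust
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{
    __m256i, _mm256_castsi256_si128, _mm256_loadu_si256, _mm256_permutevar8x32_epi32,
    _mm256_setr_epi32, _mm256_setr_epi8, _mm256_shuffle_epi8, _mm_cvtsi128_si64,
};

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn pack_third_bytes(v: __m256i) -> [u8; 8] {
    // Byte 2 of a little-endian i32 holds bits 16..=23, so picking it is the
    // same as `(x >> 16) as u8` and no separate shift is needed.
    let u8_shuffle_mask = _mm256_setr_epi8(
        2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    // Move the 4 packed bytes of the high 128-bit lane next to those of the low lane.
    let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);
    let packed = _mm256_shuffle_epi8(v, u8_shuffle_mask);
    let packed = _mm256_permutevar8x32_epi32(packed, u32_permute_mask);
    _mm_cvtsi128_si64(_mm256_castsi256_si128(packed)).to_le_bytes()
}

#[cfg(target_arch = "x86_64")]
fn main() {
    if !is_x86_feature_detected!("avx2") {
        return;
    }
    let input: [i32; 8] = [
        0x0001_2345, 0x00FF_7FFF, 0x0080_0000, 0, 1 << 16, 2 << 16, 3 << 16, 0x007F_FFFF,
    ];
    let expected: [u8; 8] = input.map(|x| (x >> 16) as u8); // the old scalar path
    let packed = unsafe {
        let v = _mm256_loadu_si256(input.as_ptr() as *const __m256i);
        pack_third_bytes(v)
    };
    assert_eq!(packed, expected);
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```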
