11#[ cfg( target_arch = "x86" ) ]
22use core:: arch:: x86:: {
3- __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
4- _mm256_srli_epi32, _mm256_sub_epi32,
3+ __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
4+ _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_setr_epi8,
5+ _mm256_shuffle_epi8, _mm256_sub_epi32, _mm_cvtsi128_si64,
56} ;
67
78#[ cfg( target_arch = "x86_64" ) ]
89use core:: arch:: x86_64:: {
9- __m256i, _mm256_add_epi32, _mm256_mullo_epi32, _mm256_set1_epi32, _mm256_set_epi32,
10- _mm256_srli_epi32, _mm256_sub_epi32,
10+ __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_mullo_epi32,
11+ _mm256_permutevar8x32_epi32, _mm256_set1_epi32, _mm256_setr_epi32, _mm256_setr_epi8,
12+ _mm256_shuffle_epi8, _mm256_sub_epi32, _mm_cvtsi128_si64,
1113} ;
1214
1315use alloc:: vec:: Vec ;
@@ -27,7 +29,7 @@ macro_rules! ycbcr_image_avx2 {
2729 #[ target_feature( enable = "avx2" ) ]
2830 fn load3( data: & [ u8 ] ) -> __m256i {
2931 _ = data[ 7 * $num_colors] ; // dummy indexing operation up front to avoid bounds checks later
30- _mm256_set_epi32 (
32+ _mm256_setr_epi32 (
3133 data[ 0 ] as i32 ,
3234 data[ 1 * $num_colors] as i32 ,
3335 data[ 2 * $num_colors] as i32 ,
@@ -41,13 +43,25 @@ macro_rules! ycbcr_image_avx2 {
4143
/// Extracts the third byte (bits 16..24) of each of the eight `i32`
/// lanes in `data`, returning the eight bytes in lane order.
///
/// This is the SIMD equivalent of `(lane >> 16) as u8` applied per
/// lane: a fused "shift right by 16 and truncate to u8" kept entirely
/// in vector registers, replacing an earlier transmute-to-`[i32; 8]`
/// plus scalar `map`/`reverse` round-trip.
#[inline]
#[target_feature(enable = "avx2")]
fn avx_as_u8_array(data: __m256i) -> [u8; 8] {
    // Per 128-bit lane: select byte 2 of each i32 (positions 2, 6, 10,
    // 14) and pack the four selected bytes into the lane's lowest
    // dword; a -1 index writes a zero byte.
    let u8_shuffle_mask = _mm256_setr_epi8(
        2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, //
        2, 6, 10, 14, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    );
    // Move dword 4 (the high lane's packed bytes) next to dword 0 so
    // that all eight result bytes end up in the low 64 bits.
    let u32_permute_mask = _mm256_setr_epi32(0, 4, 0, 0, 0, 0, 0, 0);

    // ABCD EFGH IJKL MNOP becomes CGKO .... .... ...., in both lanes.
    let packed = _mm256_shuffle_epi8(data, u8_shuffle_mask);
    // A... B... becomes AB.. .... (cross-lane dword permute).
    let packed = _mm256_permutevar8x32_epi32(packed, u32_permute_mask);

    // No-op cast to __m128i, then extract the low 64 bits.
    // `to_le_bytes` pins the byte order explicitly; on x86 (little-
    // endian) it compiles away.
    let low = _mm256_castsi256_si128(packed);
    _mm_cvtsi128_si64(low).to_le_bytes()
}
5266
5367 let [ y_buffer, cb_buffer, cr_buffer, _] = buffers;
@@ -82,11 +96,7 @@ macro_rules! ycbcr_image_avx2 {
8296
8397 let y = _mm256_add_epi32( _mm256_add_epi32( yr, yg) , yb) ;
8498 let y = _mm256_add_epi32( y, _mm256_set1_epi32( 0x7FFF ) ) ;
85- let y = _mm256_srli_epi32( y, 16 ) ;
86- let y: [ i32 ; 8 ] = avx_as_i32_array( y) ;
87- let mut y: [ u8 ; 8 ] = y. map( |x| x as u8 ) ;
88- y. reverse( ) ;
89- y_buffer. extend_from_slice( & y) ;
99+ y_buffer. extend( avx_as_u8_array( y) ) ;
90100
91101 let cbr = _mm256_mullo_epi32( cbmulr, r) ;
92102 let cbg = _mm256_mullo_epi32( cbmulg, g) ;
@@ -95,11 +105,7 @@ macro_rules! ycbcr_image_avx2 {
95105 let cb = _mm256_add_epi32( _mm256_sub_epi32( cbr, cbg) , cbb) ;
96106 let cb = _mm256_add_epi32( cb, _mm256_set1_epi32( 128 << 16 ) ) ;
97107 let cb = _mm256_add_epi32( cb, _mm256_set1_epi32( 0x7FFF ) ) ;
98- let cb = _mm256_srli_epi32( cb, 16 ) ;
99- let cb: [ i32 ; 8 ] = avx_as_i32_array( cb) ;
100- let mut cb: [ u8 ; 8 ] = cb. map( |x| x as u8 ) ;
101- cb. reverse( ) ;
102- cb_buffer. extend_from_slice( & cb) ;
108+ cb_buffer. extend( avx_as_u8_array( cb) ) ;
103109
104110 let crr = _mm256_mullo_epi32( crmulr, r) ;
105111 let crg = _mm256_mullo_epi32( crmulg, g) ;
@@ -108,11 +114,7 @@ macro_rules! ycbcr_image_avx2 {
108114 let cr = _mm256_sub_epi32( _mm256_sub_epi32( crr, crg) , crb) ;
109115 let cr = _mm256_add_epi32( cr, _mm256_set1_epi32( 128 << 16 ) ) ;
110116 let cr = _mm256_add_epi32( cr, _mm256_set1_epi32( 0x7FFF ) ) ;
111- let cr = _mm256_srli_epi32( cr, 16 ) ;
112- let cr: [ i32 ; 8 ] = avx_as_i32_array( cr) ;
113- let mut cr: [ u8 ; 8 ] = cr. map( |x| x as u8 ) ;
114- cr. reverse( ) ;
115- cr_buffer. extend_from_slice( & cr) ;
117+ cr_buffer. extend( avx_as_u8_array( cr) ) ;
116118 }
117119
118120 for _ in 0 ..self . width( ) % 8 {
0 commit comments