55#include " rgb1010102.h"
66#include < vector>
77
8+ #if HAVE_NEON
9+
10+ #include < arm_neon.h>
11+
#if defined(__ARM_NEON) || defined(__ARM_NEON__)

/**
 * Scales four 10-bit lanes to 8 bits as floor(v * 255 / 1023) without a division.
 *
 * A plain `>> 10` would map the maximum input 1023 to 254; the correction term below
 * makes the result identical to the integer division used by the scalar path.
 */
static inline uint32x4_t scale10To8x4_NEON(const uint32x4_t value) {
    const uint32x4_t scaled = vmulq_n_u32(value, 255);
    // For every t in [0, 1023 * 255]: floor(t / 1023) == (t + (t >> 10) + 1) >> 10.
    const uint32x4_t corrected = vaddq_u32(vsraq_n_u32(scaled, scaled, 10), vdupq_n_u32(1));
    return vshrq_n_u32(corrected, 10);
}

#endif

/**
 * Converts packed 10:10:10:2 pixels (one 32-bit word per pixel) to four 8-bit channels per pixel.
 *
 * Each 10-bit channel is scaled as (v * 255) / 1023. The 2-bit alpha is bit-replicated to
 * 10 bits and scaled the same way, which is exactly alpha * 85. On a little-endian host
 * the output bytes are ordered [bits 0-9, bits 10-19, bits 20-29, alpha]; on a big-endian
 * host the first and third bytes are swapped.
 *
 * @param src        first byte of the first source row
 * @param srcStride  distance in bytes between consecutive source rows
 * @param dst        first byte of the first destination row
 * @param dstStride  distance in bytes between consecutive destination rows
 * @param width      pixels per row
 * @param height     number of rows
 *
 * The vector path handles four pixels at a time and never touches memory outside the
 * `width` pixels of a row; leftover pixels go through the scalar tail. The vector path is
 * compiled only when the compiler reports NEON support and is used only on little-endian
 * hosts, so that both paths always produce the same byte order.
 */
void convertRGBA1010102ToU8_NEON(const uint8_t *src, int srcStride, uint8_t *dst, int dstStride,
                                 int width, int height) {
    const uint32_t endianProbe = 0x01020304;
    const bool littleEndian = reinterpret_cast<const uint8_t *>(&endianProbe)[0] == 0x04;

    const uint32_t scalarMask = (1u << 10u) - 1u;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    const uint32x4_t mask = vdupq_n_u32(0x3FF);
#endif

    for (int y = 0; y < height; ++y) {
        const uint8_t *srcPointer = src;
        uint8_t *dstPointer = dst;
        int x = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // Four pixels in (16 bytes), four pixels out (16 bytes).
        for (; littleEndian && width - x >= 4; x += 4) {
            // Byte load + reinterpret: no alignment assumption on the source row.
            const uint32x4_t packed = vreinterpretq_u32_u8(vld1q_u8(srcPointer));

            const uint32x4_t low = scale10To8x4_NEON(vandq_u32(packed, mask));
            const uint32x4_t mid = scale10To8x4_NEON(vandq_u32(vshrq_n_u32(packed, 10), mask));
            const uint32x4_t high = scale10To8x4_NEON(vandq_u32(vshrq_n_u32(packed, 20), mask));
            // 2-bit alpha -> 8 bits: replicate-to-10-bits then * 255 / 1023 equals * 85.
            const uint32x4_t alpha = vmulq_n_u32(vshrq_n_u32(packed, 30), 85);

            // Every channel is <= 255, so the four of them pack into one 32-bit lane.
            uint32x4_t pixels = vorrq_u32(low, vshlq_n_u32(mid, 8));
            pixels = vorrq_u32(pixels, vshlq_n_u32(high, 16));
            pixels = vorrq_u32(pixels, vshlq_n_u32(alpha, 24));

            vst1q_u8(dstPointer, vreinterpretq_u8_u32(pixels));

            srcPointer += 16;
            dstPointer += 16;
        }
#endif

        // Scalar tail (and the whole row when the vector path is unavailable).
        for (; x < width; ++x) {
            const uint32_t rgba1010102 = reinterpret_cast<const uint32_t *>(srcPointer)[0];

            const uint32_t b = rgba1010102 & scalarMask;
            const uint32_t g = (rgba1010102 >> 10) & scalarMask;
            const uint32_t r = (rgba1010102 >> 20) & scalarMask;

            const uint32_t a1 = rgba1010102 >> 30;
            const uint32_t a = (a1 << 8) | (a1 << 6) | (a1 << 4) | (a1 << 2) | a1;

            const auto r8 = static_cast<uint8_t>((r * 255) / 1023);
            const auto g8 = static_cast<uint8_t>((g * 255) / 1023);
            const auto b8 = static_cast<uint8_t>((b * 255) / 1023);
            const auto a8 = static_cast<uint8_t>((a * 255) / 1023);

            if (littleEndian) {
                dstPointer[0] = b8;
                dstPointer[1] = g8;
                dstPointer[2] = r8;
                dstPointer[3] = a8;
            } else {
                dstPointer[0] = r8;
                dstPointer[1] = g8;
                dstPointer[2] = b8;
                dstPointer[3] = a8;
            }

            srcPointer += 4;
            dstPointer += 4;
        }

        src += srcStride;
        dst += dstStride;
    }
}
116+
/**
 * Unpacks 10:10:10:2 pixels (one 32-bit word per pixel) into four 16-bit channels per pixel.
 *
 * The three 10-bit channels are copied unscaled (range 0..1023). The 2-bit alpha is
 * bit-replicated to 10 bits (0, 341, 682 or 1023). On a little-endian host the output
 * channels are ordered [bits 0-9, bits 10-19, bits 20-29, alpha]; on a big-endian host
 * the first and third channels are swapped.
 *
 * @param src        first byte of the first source row
 * @param srcStride  distance in bytes between consecutive source rows
 * @param dst        first element of the first destination row
 * @param dstStride  distance in BYTES between consecutive destination rows
 * @param width      pixels per row
 * @param height     number of rows
 *
 * The vector path consumes four pixels (16 source bytes) and produces four pixels
 * (32 destination bytes) per iteration, and never touches memory outside the `width`
 * pixels of a row; leftover pixels go through the scalar tail. The vector path is compiled
 * only when the compiler reports NEON support and is used only on little-endian hosts, so
 * that both paths always produce the same channel order.
 */
void convertRGBA1010102ToU16_NEON(const uint8_t *src, int srcStride, uint16_t *dst, int dstStride,
                                  int width, int height) {
    const uint32_t endianProbe = 0x01020304;
    const bool littleEndian = reinterpret_cast<const uint8_t *>(&endianProbe)[0] == 0x04;

    const uint32_t scalarMask = (1u << 10u) - 1u;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    const uint32x4_t mask = vdupq_n_u32(0x3FF);
#endif

    // Rows are stepped in bytes because dstStride is a byte count.
    auto dstRow = reinterpret_cast<uint8_t *>(dst);

    for (int y = 0; y < height; ++y) {
        const uint8_t *srcPointer = src;
        auto dstPointer = reinterpret_cast<uint16_t *>(dstRow);
        int x = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        for (; littleEndian && width - x >= 4; x += 4) {
            // Byte load + reinterpret: no alignment assumption on the source row.
            const uint32x4_t packed = vreinterpretq_u32_u8(vld1q_u8(srcPointer));

            const uint32x4_t low = vandq_u32(packed, mask);
            const uint32x4_t mid = vandq_u32(vshrq_n_u32(packed, 10), mask);
            const uint32x4_t high = vandq_u32(vshrq_n_u32(packed, 20), mask);
            // Replicating the 2-bit alpha five times (a | a<<2 | ... | a<<8) equals a * 341.
            const uint32x4_t alpha = vmulq_n_u32(vshrq_n_u32(packed, 30), 341);

            // All values are <= 1023, so plain narrowing to 16 bits is lossless.
            uint16x4x4_t interleaved;
            interleaved.val[0] = vmovn_u32(low);
            interleaved.val[1] = vmovn_u32(mid);
            interleaved.val[2] = vmovn_u32(high);
            interleaved.val[3] = vmovn_u32(alpha);

            vst4_u16(dstPointer, interleaved);

            srcPointer += 16;   // 4 pixels * 4 bytes
            dstPointer += 16;   // 4 pixels * 4 channels (uint16_t elements)
        }
#endif

        // Scalar tail (and the whole row when the vector path is unavailable).
        for (; x < width; ++x) {
            const uint32_t rgba1010102 = reinterpret_cast<const uint32_t *>(srcPointer)[0];

            const uint32_t b = rgba1010102 & scalarMask;
            const uint32_t g = (rgba1010102 >> 10) & scalarMask;
            const uint32_t r = (rgba1010102 >> 20) & scalarMask;

            const uint32_t a1 = rgba1010102 >> 30;
            const uint32_t a = (a1 << 8) | (a1 << 6) | (a1 << 4) | (a1 << 2) | a1;

            const auto rUInt16 = static_cast<uint16_t>(r);
            const auto gUInt16 = static_cast<uint16_t>(g);
            const auto bUInt16 = static_cast<uint16_t>(b);
            const auto aUInt16 = static_cast<uint16_t>(a);

            if (littleEndian) {
                dstPointer[0] = bUInt16;
                dstPointer[1] = gUInt16;
                dstPointer[2] = rUInt16;
                dstPointer[3] = aUInt16;
            } else {
                dstPointer[0] = rUInt16;
                dstPointer[1] = gUInt16;
                dstPointer[2] = bUInt16;
                dstPointer[3] = aUInt16;
            }

            srcPointer += 4;
            dstPointer += 4;
        }

        src += srcStride;
        dstRow += dstStride;
    }
}
220+
221+ #endif
222+
8223void convertRGBA1010102ToU16_C (const uint8_t *src, int srcStride, uint16_t *dst, int dstStride,
9224 int width, int height) {
10225 auto mDstPointer = reinterpret_cast <uint8_t *>(dst);
@@ -37,22 +252,23 @@ void convertRGBA1010102ToU16_C(const uint8_t *src, int srcStride, uint16_t *dst,
37252 uint32_t a = (a1 << 8 ) | (a1 << 6 ) | (a1 << 4 ) | (a1 << 2 ) | a1;
38253
39254 // Convert each channel to floating-point values
40- auto rFloat = static_cast <uint16_t >(r);
41- auto gFloat = static_cast <uint16_t >(g);
42- auto bFloat = static_cast <uint16_t >(b);
43- auto aFloat = static_cast <uint16_t >(a);
255+ auto rUInt16 = static_cast <uint16_t >(r);
256+ auto gUInt16 = static_cast <uint16_t >(g);
257+ auto bUInt16 = static_cast <uint16_t >(b);
258+ auto aUInt16 = static_cast <uint16_t >(a);
44259
45260 auto dstCast = reinterpret_cast <uint16_t *>(dstPointer);
261+
46262 if (littleEndian) {
47- dstCast[0 ] = bFloat ;
48- dstCast[1 ] = gFloat ;
49- dstCast[2 ] = rFloat ;
50- dstCast[3 ] = aFloat ;
263+ dstCast[0 ] = bUInt16 ;
264+ dstCast[1 ] = gUInt16 ;
265+ dstCast[2 ] = rUInt16 ;
266+ dstCast[3 ] = aUInt16 ;
51267 } else {
52- dstCast[0 ] = rFloat ;
53- dstCast[1 ] = gFloat ;
54- dstCast[2 ] = bFloat ;
55- dstCast[3 ] = aFloat ;
268+ dstCast[0 ] = rUInt16 ;
269+ dstCast[1 ] = gUInt16 ;
270+ dstCast[2 ] = bUInt16 ;
271+ dstCast[3 ] = aUInt16 ;
56272 }
57273
58274 srcPointer += 4 ;
@@ -79,6 +295,8 @@ void convertRGBA1010102ToU8_C(const uint8_t *src, int srcStride, uint8_t *dst, i
79295 littleEndian = false ;
80296 }
81297
298+ const uint32_t mask = (1u << 10u ) - 1u ;
299+
82300 for (int y = 0 ; y < height; ++y) {
83301
84302 auto dstPointer = reinterpret_cast <uint8_t *>(mDstPointer );
@@ -87,7 +305,6 @@ void convertRGBA1010102ToU8_C(const uint8_t *src, int srcStride, uint8_t *dst, i
87305 for (int x = 0 ; x < width; ++x) {
88306 uint32_t rgba1010102 = reinterpret_cast <const uint32_t *>(srcPointer)[0 ];
89307
90- const uint32_t mask = (1u << 10u ) - 1u ;
91308 uint32_t b = (rgba1010102) & mask;
92309 uint32_t g = (rgba1010102 >> 10 ) & mask;
93310 uint32_t r = (rgba1010102 >> 20 ) & mask;
@@ -129,10 +346,18 @@ void convertRGBA1010102ToU8_C(const uint8_t *src, int srcStride, uint8_t *dst, i
129346
130347void RGBA1010102ToU8 (const uint8_t *src, int srcStride, uint8_t *dst, int dstStride,
131348 int width, int height) {
349+ #if HAVE_NEON
350+ convertRGBA1010102ToU8_NEON (src, srcStride, dst, dstStride, width, height);
351+ #else
132352 convertRGBA1010102ToU8_C (src, srcStride, dst, dstStride, width, height);
353+ #endif
133354}
134355
135356void RGBA1010102ToU16 (const uint8_t *src, int srcStride, uint16_t *dst, int dstStride,
136357 int width, int height) {
358+ #if HAVE_NEON
359+ convertRGBA1010102ToU16_NEON (src, srcStride, dst, dstStride, width, height);
360+ #else
137361 convertRGBA1010102ToU16_C (src, srcStride, dst, dstStride, width, height);
362+ #endif
138363}
0 commit comments