@@ -46,44 +46,38 @@ namespace coder::HWY_NAMESPACE {
4646 using hwy::HWY_NAMESPACE::ScalableTag;
4747 using hwy::HWY_NAMESPACE::StoreU;
4848 using hwy::HWY_NAMESPACE::LoadU;
49+ using hwy::HWY_NAMESPACE::FixedTag;
4950 using hwy::HWY_NAMESPACE::Vec;
5051 using hwy::HWY_NAMESPACE::LoadInterleaved4;
5152 using hwy::HWY_NAMESPACE::StoreInterleaved4;
5253
5354 template <class D , typename Buf, typename T = Vec<D>>
5455 void
55- CopyUnalignedRGBARow (const D d, const Buf *HWY_RESTRICT src, Buf *HWY_RESTRICT dst, int width) {
56+ Copy1Row (const D d, const Buf *HWY_RESTRICT src, Buf *HWY_RESTRICT dst, int width) {
5657 int x = 0 ;
5758 using VU = Vec<decltype (d)>;
5859 int pixels = d.MaxLanes ();
5960 for (; x + pixels < width; x += pixels) {
60- VU a, r, g, b;
61- LoadInterleaved4 (d, src, a, r, g, b);
62- StoreInterleaved4 (a, r, g, b, d, dst);
61+ VU a = LoadU (d, src);
62+ StoreU (a, d, reinterpret_cast <Buf *>(dst));
6363
64- src += pixels * 4 ;
65- dst += pixels * 4 ;
64+ src += pixels;
65+ dst += pixels;
6666 }
6767
6868 for (; x < width; ++x) {
6969 auto p1 = src[0 ];
70- auto p2 = src[1 ];
71- auto p3 = src[2 ];
72- auto p4 = src[3 ];
73-
7470 dst[0 ] = p1;
75- dst[1 ] = p2;
76- dst[2 ] = p3;
77- dst[3 ] = p4;
7871
79- src += 4 ;
80- dst += 4 ;
72+ src += 1 ;
73+ dst += 1 ;
8174 }
8275 }
8376
8477 void
8578 CopyUnalignedRGBA (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
86- int dstStride, int width,
79+ int dstStride,
80+ int width,
8781 int height,
8882 int pixelSize) {
8983 int threadCount = clamp (min (static_cast <int >(std::thread::hardware_concurrency ()),
@@ -102,7 +96,7 @@ namespace coder::HWY_NAMESPACE {
10296 for (int y = start; y < end; ++y) {
10397 if (pixelSize == 1 ) {
10498 const ScalableTag<uint8_t > du8;
105- auto fn = CopyUnalignedRGBARow <decltype (du8), uint8_t >;
99+ auto fn = Copy1Row <decltype (du8), uint8_t >;
106100 fn (du8,
107101 reinterpret_cast <const uint8_t *>(
108102 reinterpret_cast <const uint8_t *>(src) + (y * srcStride)),
@@ -111,7 +105,7 @@ namespace coder::HWY_NAMESPACE {
111105 width);
112106 } else if (pixelSize == 2 ) {
113107 const ScalableTag<uint16_t > du16;
114- auto fn = CopyUnalignedRGBARow <decltype (du16), uint16_t >;
108+ auto fn = Copy1Row <decltype (du16), uint16_t >;
115109 fn (du16,
116110 reinterpret_cast <const uint16_t *>(
117111 reinterpret_cast <const uint8_t *>(src) +
@@ -121,14 +115,12 @@ namespace coder::HWY_NAMESPACE {
121115 width);
122116 } else if (pixelSize == 4 ) {
123117 const ScalableTag<uint32_t > df32;
124- auto fn = CopyUnalignedRGBARow <decltype (df32), uint32_t >;
118+ auto fn = Copy1Row <decltype (df32), uint32_t >;
125119 fn (df32,
126120 reinterpret_cast <const uint32_t *>(
127- reinterpret_cast <const uint8_t *>(src) +
128- (y * srcStride)),
121+ reinterpret_cast <const uint8_t *>(src) + (y * srcStride)),
129122 reinterpret_cast <uint32_t *>(reinterpret_cast <uint8_t *>(dst) +
130- (y * dstStride)),
131- width);
123+ (y * dstStride)), width);
132124 }
133125 }
134126 });
@@ -138,82 +130,22 @@ namespace coder::HWY_NAMESPACE {
138130 thread.join ();
139131 }
140132 }
141-
142- void CopyUnalignedRGB565Row (const uint16_t *HWY_RESTRICT src, uint16_t *HWY_RESTRICT dst,
143- int width) {
144- int x = 0 ;
145- const ScalableTag<uint16_t > du;
146- using VU = Vec<decltype (du)>;
147- int pixels = du.MaxLanes ();
148- for (; x + pixels < width; x += pixels) {
149- VU pixel = LoadU (du, src);
150- StoreU (pixel, du, dst);
151-
152- src += pixels;
153- dst += pixels;
154- }
155-
156- for (; x < width; ++x) {
157- auto p1 = src[0 ];
158- dst[0 ] = p1;
159- src += 1 ;
160- dst += 1 ;
161- }
162- }
163-
164- void
165- CopyUnalignedRGB565 (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
166- int dstStride, int width,
167- int height) {
168- int threadCount = clamp (min (static_cast <int >(std::thread::hardware_concurrency ()),
169- height * width / (356 * 356 )), 1 , 12 );
170- vector<thread> workers;
171- int segmentHeight = height / threadCount;
172-
173- for (int i = 0 ; i < threadCount; i++) {
174- int start = i * segmentHeight;
175- int end = (i + 1 ) * segmentHeight;
176- if (i == threadCount - 1 ) {
177- end = height;
178- }
179- workers.emplace_back ([start, end, src, dstStride, dst, srcStride, width]() {
180- for (int y = start; y < end; ++y) {
181- CopyUnalignedRGB565Row (
182- reinterpret_cast <const uint16_t *>(src + (y * srcStride)),
183- reinterpret_cast <uint16_t *>(dst + (y * dstStride)),
184- width);
185- }
186- });
187- }
188-
189- for (std::thread &thread: workers) {
190- thread.join ();
191- }
192- }
193-
194133}
195134
196135HWY_AFTER_NAMESPACE ();
197136
198137#if HWY_ONCE
199138namespace coder {
200139 HWY_EXPORT (CopyUnalignedRGBA);
201- HWY_EXPORT (CopyUnalignedRGB565);
202140
203141 HWY_DLLEXPORT void
204- CopyUnalignedRGBA (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
205- int dstStride, int width,
206- int height,
207- int pixelSize) {
142+ CopyUnaligned (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
143+ int dstStride, int width,
144+ int height,
145+ int pixelSize) {
208146 HWY_DYNAMIC_DISPATCH (CopyUnalignedRGBA)(src, srcStride, dst, dstStride, width, height,
209147 pixelSize);
210148 }
211149
212- HWY_DLLEXPORT void
213- CopyUnalignedRGB565 (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
214- int dstStride, int width,
215- int height) {
216- HWY_DYNAMIC_DISPATCH (CopyUnalignedRGB565)(src, srcStride, dst, dstStride, width, height);
217- }
218150}
219151#endif
0 commit comments