1+ //
2+ // Created by Radzivon Bartoshyk on 12/09/2023.
3+ //
4+
5+ #include " CopyUnalignedRGBA.h"
6+ #include " ThreadPool.hpp"
7+ #include < cstdint>
8+
9+ #undef HWY_TARGET_INCLUDE
10+ #define HWY_TARGET_INCLUDE " CopyUnalignedRGBA.cpp"
11+ #include " hwy/foreach_target.h"
12+ #include " hwy/highway.h"
13+
14+ HWY_BEFORE_NAMESPACE ();
15+
16+ namespace coder {
17+ namespace HWY_NAMESPACE {
18+
19+ using hwy::HWY_NAMESPACE::ScalableTag;
20+ using hwy::HWY_NAMESPACE::Store;
21+ using hwy::HWY_NAMESPACE::Load;
22+ using hwy::HWY_NAMESPACE::Vec;
23+ using hwy::HWY_NAMESPACE::TFromD;
24+
25+ template <class D , typename T = TFromD<D>>
26+ void
27+ CopyUnalignedRGBARow (const D d, const T *HWY_RESTRICT src, T *HWY_RESTRICT dst, int width) {
28+ int x = 0 ;
29+ using VU = Vec<decltype (d)>;
30+ int pixels = d.MaxLanes () / 4 ;
31+ for (x = 0 ; x + pixels < width; x += pixels) {
32+ VU pixel = Load (d, src);
33+ Store (pixel, d, dst);
34+
35+ src += pixels * 4 ;
36+ dst += pixels * 4 ;
37+ }
38+
39+ for (; x < width; ++x) {
40+ auto p1 = src[0 ];
41+ auto p2 = src[1 ];
42+ auto p3 = src[2 ];
43+ auto p4 = src[3 ];
44+
45+ dst[0 ] = p1;
46+ dst[1 ] = p2;
47+ dst[2 ] = p3;
48+ dst[3 ] = p4;
49+
50+ src += 4 ;
51+ dst += 4 ;
52+ }
53+ }
54+
55+ void
56+ CopyUnalignedRGBA (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
57+ int dstStride, int width,
58+ int height,
59+ int pixelSize) {
60+ ThreadPool pool;
61+ std::vector<std::future<void >> results;
62+
63+ for (int y = 0 ; y < height; y++) {
64+ if (pixelSize == 1 ) {
65+ const ScalableTag<uint8_t > du8;
66+ auto fn = CopyUnalignedRGBARow<decltype (du8)>;
67+ auto r = pool.enqueue (fn,
68+ du8,
69+ reinterpret_cast <const uint8_t *>(src + (y * srcStride)),
70+ reinterpret_cast <uint8_t *>(dst + (y * dstStride)),
71+ width);
72+ results.push_back (std::move (r));
73+ } else if (pixelSize == 2 ) {
74+ const ScalableTag<uint16_t > du16;
75+ auto fn = CopyUnalignedRGBARow<decltype (du16)>;
76+ auto r = pool.enqueue (fn,
77+ du16,
78+ reinterpret_cast <const uint16_t *>(src + (y * srcStride)),
79+ reinterpret_cast <uint16_t *>(dst + (y * dstStride)),
80+ width);
81+ results.push_back (std::move (r));
82+ } else if (pixelSize == 4 ) {
83+ const ScalableTag<float > df32;
84+ auto fn = CopyUnalignedRGBARow<decltype (df32)>;
85+ auto r = pool.enqueue (fn,
86+ df32,
87+ reinterpret_cast <const float *>(src + (y * srcStride)),
88+ reinterpret_cast <float *>(dst + (y * dstStride)),
89+ width);
90+ results.push_back (std::move (r));
91+ }
92+ }
93+
94+ for (auto &result: results) {
95+ result.wait ();
96+ }
97+ }
98+
99+ }
100+ }
101+
102+ HWY_AFTER_NAMESPACE ();
103+
104+ #if HWY_ONCE
105+ namespace coder {
106+ HWY_EXPORT (CopyUnalignedRGBA);
107+
108+ HWY_DLLEXPORT void
109+ CopyUnalignedRGBA (const uint8_t *HWY_RESTRICT src, int srcStride, uint8_t *HWY_RESTRICT dst,
110+ int dstStride, int width,
111+ int height,
112+ int pixelSize) {
113+ HWY_DYNAMIC_DISPATCH (CopyUnalignedRGBA)(src, srcStride, dst, dstStride, width, height,
114+ pixelSize);
115+ }
116+ }
117+ #endif
0 commit comments