1+ //
2+ // Created by Radzivon Bartoshyk on 15/09/2023.
3+ //
4+
5+ #include " HLG.h"
6+ #include " ThreadPool.hpp"
7+ #include " HalfFloats.h"
8+
9+ #undef HWY_TARGET_INCLUDE
10+ #define HWY_TARGET_INCLUDE " HLG.cpp"
11+
12+ #include " hwy/foreach_target.h"
13+ #include " hwy/highway.h"
14+
15+ HWY_BEFORE_NAMESPACE ();
16+
17+ namespace coder {
18+ namespace HWY_NAMESPACE {
19+
20+ using hwy::HWY_NAMESPACE::Set;
21+ using hwy::HWY_NAMESPACE::FixedTag;
22+ using hwy::HWY_NAMESPACE::Vec;
23+ using hwy::HWY_NAMESPACE::Add;
24+ using hwy::HWY_NAMESPACE::Sub;
25+ using hwy::HWY_NAMESPACE::Div;
26+ using hwy::HWY_NAMESPACE::Min;
27+ using hwy::HWY_NAMESPACE::Max;
28+ using hwy::HWY_NAMESPACE::Mul;
29+ using hwy::HWY_NAMESPACE::MulAdd;
30+ using hwy::HWY_NAMESPACE::MulSub;
31+ using hwy::HWY_NAMESPACE::NegMulAdd;
32+ using hwy::HWY_NAMESPACE::Rebind;
33+ using hwy::HWY_NAMESPACE::ConvertTo;
34+ using hwy::HWY_NAMESPACE::DemoteTo;
35+ using hwy::HWY_NAMESPACE::BitCast;
36+ using hwy::HWY_NAMESPACE::ShiftLeft;
37+ using hwy::HWY_NAMESPACE::ShiftRight;
38+ using hwy::float32_t ;
39+
40+ FixedTag<float32_t , 4 > fixedFloatTag;
41+
42+ const Vec<FixedTag<float32_t , 4 >> expTab[8 ] =
43+ {
44+ Set (fixedFloatTag, 1 .f ),
45+ Set (fixedFloatTag, 0 .0416598916054f ),
46+ Set (fixedFloatTag, 0 .500000596046f ),
47+ Set (fixedFloatTag, 0 .0014122662833f ),
48+ Set (fixedFloatTag, 1 .00000011921f ),
49+ Set (fixedFloatTag, 0 .00833693705499f ),
50+ Set (fixedFloatTag, 0 .166665703058f ),
51+ Set (fixedFloatTag, 0 .000195780929062f ),
52+ };
53+
54+ const Vec<FixedTag<float32_t , 4 >> logTab[8 ] =
55+ {
56+ Set (fixedFloatTag, -2 .29561495781f ),
57+ Set (fixedFloatTag, -2 .47071170807f ),
58+ Set (fixedFloatTag, -5 .68692588806f ),
59+ Set (fixedFloatTag, -0 .165253549814f ),
60+ Set (fixedFloatTag, 5 .17591238022f ),
61+ Set (fixedFloatTag, 0 .844007015228f ),
62+ Set (fixedFloatTag, 4 .58445882797f ),
63+ Set (fixedFloatTag, 0 .0141278216615f ),
64+ };
65+
66+ inline Vec<FixedTag<float32_t , 4 >>
67+ TaylorPolyExp (Vec<FixedTag<float32_t , 4 >> x, const Vec<FixedTag<float32_t , 4 >> *coeffs) {
68+ FixedTag<float32_t , 4 > d;
69+ using VF32 = Vec<decltype (d)>;
70+
71+ VF32 A = MulAdd (coeffs[4 ], x, coeffs[0 ]);
72+ VF32 B = MulAdd (coeffs[6 ], x, coeffs[2 ]);
73+ VF32 C = MulAdd (coeffs[5 ], x, coeffs[1 ]);
74+ VF32 D = MulAdd (coeffs[7 ], x, coeffs[3 ]);
75+ VF32 x2 = Mul (x, x);
76+ VF32 x4 = Mul (x2, x2);
77+ VF32 res = MulAdd (MulAdd (D, x2, C), x4, MulAdd (B, x2, A));
78+ return res;
79+ }
80+
81+ inline Vec<FixedTag<float32_t , 4 >> ExpF32 (Vec<FixedTag<float32_t , 4 >> x) {
82+ FixedTag<float32_t , 4 > d;
83+ using VF32 = Vec<decltype (d)>;
84+ Rebind<int32_t , decltype (d)> s32;
85+ using VS32 = Vec<decltype (s32)>;
86+ Rebind<float32_t , decltype (s32)> fs32;
87+ static const VF32 CONST_LN2 = Set (d, 0 .6931471805f ); // ln(2)
88+ static const VF32 CONST_INV_LN2 = Set (d, 1 .4426950408f ); // 1/ln(2)
89+ // Perform range reduction [-log(2),log(2)]
90+ VS32 m = ConvertTo (s32, Mul (x, CONST_INV_LN2));
91+ auto val = NegMulAdd (ConvertTo (fs32, m), CONST_LN2, x);
92+ // Polynomial Approximation
93+ auto poly = TaylorPolyExp (val, &expTab[0 ]);
94+ // Reconstruct
95+ poly = BitCast (fs32, Add (BitCast (s32, poly), ShiftLeft<23 >(m)));
96+ return poly;
97+ }
98+
99+ inline Vec<FixedTag<float32_t , 4 >> LogF32 (Vec<FixedTag<float32_t , 4 >> x) {
100+ FixedTag<float32_t , 4 > d;
101+ using VF32 = Vec<decltype (d)>;
102+ FixedTag<int32_t , 4 > s32;
103+ Rebind<int32_t , decltype (d)> sb32;
104+ Rebind<float32_t , decltype (s32)> fli32;
105+ using VS32 = Vec<decltype (s32)>;
106+ static const VS32 CONST_127 = Set (s32, 127 ); // 127
107+ static const VF32 CONST_LN2 = Set (d, 0 .6931471805f ); // ln(2)
108+ // Extract exponent
109+ VS32 m = Sub (BitCast (s32, ShiftRight<23 >(BitCast (s32, BitCast (sb32, x)))), CONST_127);
110+ VF32 val = BitCast (fli32,
111+ Sub (BitCast (sb32, x), ShiftLeft<23 >(m)));
112+ // Polynomial Approximation
113+ VF32 poly = TaylorPolyExp (val, &logTab[0 ]);
114+ // Reconstruct
115+ poly = MulAdd (ConvertTo (fli32, m), CONST_LN2, poly);
116+ return poly;
117+ }
118+
119+ inline Vec<FixedTag<float32_t , 4 >>
120+ PowHLG (Vec<FixedTag<float32_t , 4 >> val, Vec<FixedTag<float32_t , 4 >> n) {
121+ return ExpF32 (Mul (n, LogF32 (val)));
122+ }
123+
124+ inline Vec<FixedTag<float32_t , 4 >> HLGEotf (Vec<FixedTag<float32_t , 4 >> v) {
125+ v = Max (Zero (fixedFloatTag), v);
126+ using VF32 = Vec<decltype (fixedFloatTag)>;
127+ VF32 a = Set (fixedFloatTag, 0 .17883277f );
128+ VF32 b = Set (fixedFloatTag, 0 .28466892f );
129+ VF32 c = Set (fixedFloatTag, 0 .55991073f );
130+ VF32 mm = Set (fixedFloatTag, 0 .5f );
131+ const auto cmp = v < mm;
132+ auto branch1 = Div (Mul (v, v), Set (fixedFloatTag, 3 .f ));
133+ auto branch2 = Div (ExpF32 (Add (Div (Sub (v, c), a), b)), Set (fixedFloatTag, 12 .0f ));
134+ return IfThenElse (cmp, branch1, branch2);
135+ }
136+
137+ float Evaluate (float v) {
138+ // Spec: http://www.arib.or.jp/english/html/overview/doc/2-STD-B67v1_0.pdf
139+ v = std::max (0 .0f , v);
140+ constexpr float a = 0 .17883277f ;
141+ constexpr float b = 0 .28466892f ;
142+ constexpr float c = 0 .55991073f ;
143+ if (v <= 0 .5f )
144+ v = v * v / 3 .0f ;
145+ else
146+ v = (exp ((v - c) / a) + b) / 12 .f ;
147+ return v;
148+ }
149+
150+ template <class D , typename T = Vec<D>>
151+ inline T bt2020HLGGammaCorrection (const D d, T color) {
152+ static const float betaRec2020 = 0 .018053968510807f ;
153+ static const float alphaRec2020 = 1 .09929682680944f ;
154+ T bt2020 = Set (d, betaRec2020);
155+ T alpha2020 = Set (d, alphaRec2020);
156+ const auto cmp = color < bt2020;
157+ const auto branch1 = Mul (color, Set (d, 4 .5f ));
158+ const auto branch2 =
159+ Sub (Mul (alpha2020, PowHLG (color, Set (d, 0 .45f ))), Sub (alpha2020, Set (d, 1 .0f )));
160+ return IfThenElse (cmp, branch1, branch2);
161+ }
162+
163+ float bt2020HLGGammaCorrection (float linear) {
164+ static const float betaRec2020 = 0 .018053968510807f ;
165+ static const float alphaRec2020 = 1 .09929682680944f ;
166+ if (0 <= betaRec2020 && linear < betaRec2020) {
167+ return 4 .5f * linear;
168+ } else if (betaRec2020 <= linear && linear < 1 ) {
169+ return alphaRec2020 * powf (linear, 0 .45f ) - (alphaRec2020 - 1 .0f );
170+ }
171+ return linear;
172+ }
173+
174+ void TransferROWHLGU8 (uint8_t *data, float maxColors) {
175+ auto r = (float ) data[0 ] / (float ) maxColors;
176+ auto g = (float ) data[1 ] / (float ) maxColors;
177+ auto b = (float ) data[2 ] / (float ) maxColors;
178+ data[0 ] = (uint8_t ) std::clamp (
179+ (float ) bt2020HLGGammaCorrection (Evaluate (r)) * maxColors, 0 .0f , maxColors);
180+ data[1 ] = (uint8_t ) std::clamp (
181+ (float ) bt2020HLGGammaCorrection (Evaluate (g)) * maxColors, 0 .0f , maxColors);
182+ data[2 ] = (uint8_t ) std::clamp (
183+ (float ) bt2020HLGGammaCorrection (Evaluate (b)) * maxColors, 0 .0f , maxColors);
184+ }
185+
186+ void TransferROWHLGU16 (uint16_t *data, float maxColors) {
187+ auto r = (float ) data[0 ] / (float ) maxColors;
188+ auto g = (float ) data[1 ] / (float ) maxColors;
189+ auto b = (float ) data[2 ] / (float ) maxColors;
190+ data[0 ] = (uint16_t ) float_to_half ((float ) bt2020HLGGammaCorrection (Evaluate (r)));
191+ data[1 ] = (uint16_t ) float_to_half ((float ) bt2020HLGGammaCorrection (Evaluate (g)));
192+ data[2 ] = (uint16_t ) float_to_half ((float ) bt2020HLGGammaCorrection (Evaluate (b)));
193+ }
194+
195+ void ProcessHLGF16Row (uint16_t *HWY_RESTRICT data, int width, float maxColors) {
196+ const FixedTag<float32_t , 4 > df32;
197+ FixedTag<uint16_t , 4 > d;
198+
199+ const Rebind<uint32_t , decltype (d)> signed32;
200+ const Rebind<int32_t , decltype (df32)> floatToSigned;
201+ const Rebind<uint8_t , FixedTag<float32_t , 4 >> rebindOrigin;
202+
203+ const Rebind<float32_t , decltype (signed32)> rebind32;
204+ const FixedTag<uint16_t , 4 > du16;
205+ const Rebind<float16_t , decltype (df32)> rebind16;
206+
207+ using VU16 = Vec<decltype (d)>;
208+ using VF32 = Vec<decltype (df32)>;
209+
210+ auto ptr16 = reinterpret_cast <uint16_t *>(data);
211+
212+ VF32 vColors = Set (df32, (float ) maxColors);
213+
214+ int pixels = 4 ;
215+
216+ int x = 0 ;
217+ for (x = 0 ; x + pixels < width; x += pixels) {
218+ VU16 RURow;
219+ VU16 GURow;
220+ VU16 BURow;
221+ VU16 AURow;
222+ LoadInterleaved4 (d, reinterpret_cast <uint16_t *>(ptr16), RURow, GURow, BURow,
223+ AURow);
224+ VF32 r32 = BitCast (rebind32, PromoteTo (signed32, RURow));
225+ VF32 g32 = BitCast (rebind32, PromoteTo (signed32, GURow));
226+ VF32 b32 = BitCast (rebind32, PromoteTo (signed32, BURow));
227+
228+ VF32 pqR = bt2020HLGGammaCorrection (df32, HLGEotf (r32));
229+ VF32 pqG = bt2020HLGGammaCorrection (df32, HLGEotf (g32));
230+ VF32 pqB = bt2020HLGGammaCorrection (df32, HLGEotf (b32));
231+
232+ VU16 rNew = BitCast (du16, DemoteTo (rebind16, pqR));
233+ VU16 gNew = BitCast (du16, DemoteTo (rebind16, pqG));
234+ VU16 bNew = BitCast (du16, DemoteTo (rebind16, pqB));
235+
236+ StoreInterleaved4 (rNew, gNew , bNew, AURow, d,
237+ reinterpret_cast <uint8_t *>(ptr16));
238+ ptr16 += 4 * 4 ;
239+ }
240+
241+ for (; x < width; ++x) {
242+ TransferROWHLGU16 (reinterpret_cast <uint16_t *>(ptr16), maxColors);
243+ ptr16 += 4 ;
244+ }
245+ }
246+
247+ void ProcessHLGu8Row (uint8_t *HWY_RESTRICT data, int width, float maxColors) {
248+ const FixedTag<float32_t , 4 > df32;
249+ hwy::HWY_NAMESPACE::FixedTag<uint8_t , 4 > d;
250+
251+ const Rebind<uint32_t , decltype (d)> signed32;
252+ const Rebind<int32_t , decltype (df32)> floatToSigned;
253+ const Rebind<uint8_t , FixedTag<float32_t , 4 >> rebindOrigin;
254+
255+ const Rebind<float32_t , decltype (signed32)> rebind32;
256+
257+ using VU16 = Vec<decltype (d)>;
258+ using VF32 = Vec<decltype (df32)>;
259+
260+ auto ptr16 = reinterpret_cast <uint8_t *>(data);
261+
262+ VF32 vColors = Set (df32, (float ) maxColors);
263+
264+ int pixels = 4 ;
265+
266+ int x = 0 ;
267+ for (x = 0 ; x + pixels < width; x += pixels) {
268+ VU16 RURow;
269+ VU16 GURow;
270+ VU16 BURow;
271+ VU16 AURow;
272+ LoadInterleaved4 (d, reinterpret_cast <uint8_t *>(ptr16), RURow, GURow, BURow, AURow);
273+ VF32 r32 = Div (ConvertTo (rebind32, PromoteTo (signed32, RURow)), vColors);
274+ VF32 g32 = Div (ConvertTo (rebind32, PromoteTo (signed32, GURow)), vColors);
275+ VF32 b32 = Div (ConvertTo (rebind32, PromoteTo (signed32, BURow)), vColors);
276+
277+ VF32 pqR = Max (
278+ Min (Mul (bt2020HLGGammaCorrection (df32, HLGEotf (r32)), vColors), vColors),
279+ Zero (df32));
280+ VF32 pqG = Max (
281+ Min (Mul (bt2020HLGGammaCorrection (df32, HLGEotf (g32)), vColors), vColors),
282+ Zero (df32));
283+ VF32 pqB = Max (
284+ Min (Mul (bt2020HLGGammaCorrection (df32, HLGEotf (b32)), vColors), vColors),
285+ Zero (df32));
286+
287+ VU16 rNew = DemoteTo (rebindOrigin, ConvertTo (floatToSigned, pqR));
288+ VU16 gNew = DemoteTo (rebindOrigin, ConvertTo (floatToSigned, pqG));
289+ VU16 bNew = DemoteTo (rebindOrigin, ConvertTo (floatToSigned, pqB));
290+
291+ StoreInterleaved4 (rNew, gNew , bNew, AURow, d,
292+ reinterpret_cast <uint8_t *>(ptr16));
293+ ptr16 += 4 * 4 ;
294+ }
295+
296+ for (; x < width; ++x) {
297+ TransferROWHLGU8 (reinterpret_cast <uint8_t *>(ptr16), maxColors);
298+ ptr16 += 4 ;
299+ }
300+ }
301+
302+ void
303+ ProcessHLG (uint8_t *data, bool halfFloats, int stride, int width, int height, int depth) {
304+ float maxColors = powf (2 , (float ) depth) - 1 ;
305+ ThreadPool pool;
306+ std::vector<std::future<void >> results;
307+ for (int y = 0 ; y < height; ++y) {
308+ if (halfFloats) {
309+ auto ptr16 = reinterpret_cast <uint16_t *>(data + y * stride);
310+ auto r = pool.enqueue (ProcessHLGF16Row, reinterpret_cast <uint16_t *>(ptr16),
311+ width,
312+ (float ) maxColors);
313+ results.push_back (std::move (r));
314+ } else {
315+ auto ptr16 = reinterpret_cast <uint8_t *>(data + y * stride);
316+ auto r = pool.enqueue (ProcessHLGu8Row, reinterpret_cast <uint8_t *>(ptr16),
317+ width,
318+ (float ) maxColors);
319+ results.push_back (std::move (r));
320+ }
321+ }
322+
323+ for (auto &result: results) {
324+ result.wait ();
325+ }
326+ }
327+
328+ }
329+ }
330+
331+ HWY_AFTER_NAMESPACE ();
332+
333+ #if HWY_ONCE
334+ namespace coder {
335+ HWY_EXPORT (ProcessHLG);
336+ HWY_DLLEXPORT void
337+ ProcessHLG (uint8_t *data, bool halfFloats, int stride, int width, int height, int depth) {
338+ if (halfFloats) {
339+ // auto ptr16 = reinterpret_cast<uint16_t *>(data + y * stride);
340+ // if (halfFloats) {
341+ // HWY_DYNAMIC_DISPATCH(ProcessHLGu8Row)(reinterpret_cast<uint16_t *>(ptr16), width);
342+ // }
343+ } else {
344+ HWY_DYNAMIC_DISPATCH (ProcessHLG)(data, halfFloats, stride, width, height, depth);
345+ }
346+ }
347+ }
348+ #endif
0 commit comments