Skip to content

Commit 96bb04f

Browse files
committed
Added HLG
1 parent 8c18b1a commit 96bb04f

File tree

5 files changed

+373
-4
lines changed

5 files changed

+373
-4
lines changed

avif-coder/src/main/cpp/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@ add_library( # Sets the name of the library.
1818
icc/cmsgmt.c icc/cmshalf.c icc/cmsintrp.c icc/cmsio0.c icc/cmsio1.c icc/cmslut.c icc/cmsmd5.c icc/cmsmtrx.c icc/cmsnamed.c
1919
icc/cmsopt.c icc/cmspack.c icc/cmspcs.c icc/cmsplugin.c icc/cmsps2.c icc/cmssamp.c icc/cmssm.c icc/cmstypes.c icc/cmsvirt.c
2020
icc/cmswtpnt.c icc/cmsxform.c colorspace.cpp HalfFloats.cpp
21-
RgbaF16bitToNBitU16.cpp RgbaF16bitNBitU8.cpp Rgb1010102.cpp PerceptualQuantinizer.cpp ThreadPool.hpp CopyUnalignedRGBA.cpp)
21+
RgbaF16bitToNBitU16.cpp RgbaF16bitNBitU8.cpp Rgb1010102.cpp PerceptualQuantinizer.cpp
22+
ThreadPool.hpp CopyUnalignedRGBA.cpp HLG.cpp)
2223

2324
add_library(libaom STATIC IMPORTED)
2425
add_library(libx265 STATIC IMPORTED)

avif-coder/src/main/cpp/HLG.cpp

Lines changed: 348 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,348 @@
1+
//
2+
// Created by Radzivon Bartoshyk on 15/09/2023.
3+
//
4+
5+
#include "HLG.h"
6+
#include "ThreadPool.hpp"
7+
#include "HalfFloats.h"
8+
9+
#undef HWY_TARGET_INCLUDE
10+
#define HWY_TARGET_INCLUDE "HLG.cpp"
11+
12+
#include "hwy/foreach_target.h"
13+
#include "hwy/highway.h"
14+
15+
HWY_BEFORE_NAMESPACE();
16+
17+
namespace coder {
18+
namespace HWY_NAMESPACE {
19+
20+
using hwy::HWY_NAMESPACE::Set;
21+
using hwy::HWY_NAMESPACE::FixedTag;
22+
using hwy::HWY_NAMESPACE::Vec;
23+
using hwy::HWY_NAMESPACE::Add;
24+
using hwy::HWY_NAMESPACE::Sub;
25+
using hwy::HWY_NAMESPACE::Div;
26+
using hwy::HWY_NAMESPACE::Min;
27+
using hwy::HWY_NAMESPACE::Max;
28+
using hwy::HWY_NAMESPACE::Mul;
29+
using hwy::HWY_NAMESPACE::MulAdd;
30+
using hwy::HWY_NAMESPACE::MulSub;
31+
using hwy::HWY_NAMESPACE::NegMulAdd;
32+
using hwy::HWY_NAMESPACE::Rebind;
33+
using hwy::HWY_NAMESPACE::ConvertTo;
34+
using hwy::HWY_NAMESPACE::DemoteTo;
35+
using hwy::HWY_NAMESPACE::BitCast;
36+
using hwy::HWY_NAMESPACE::ShiftLeft;
37+
using hwy::HWY_NAMESPACE::ShiftRight;
38+
using hwy::float32_t;
39+
40+
FixedTag<float32_t, 4> fixedFloatTag;
41+
42+
const Vec<FixedTag<float32_t, 4>> expTab[8] =
43+
{
44+
Set(fixedFloatTag, 1.f),
45+
Set(fixedFloatTag, 0.0416598916054f),
46+
Set(fixedFloatTag, 0.500000596046f),
47+
Set(fixedFloatTag, 0.0014122662833f),
48+
Set(fixedFloatTag, 1.00000011921f),
49+
Set(fixedFloatTag, 0.00833693705499f),
50+
Set(fixedFloatTag, 0.166665703058f),
51+
Set(fixedFloatTag, 0.000195780929062f),
52+
};
53+
54+
const Vec<FixedTag<float32_t, 4>> logTab[8] =
55+
{
56+
Set(fixedFloatTag, -2.29561495781f),
57+
Set(fixedFloatTag, -2.47071170807f),
58+
Set(fixedFloatTag, -5.68692588806f),
59+
Set(fixedFloatTag, -0.165253549814f),
60+
Set(fixedFloatTag, 5.17591238022f),
61+
Set(fixedFloatTag, 0.844007015228f),
62+
Set(fixedFloatTag, 4.58445882797f),
63+
Set(fixedFloatTag, 0.0141278216615f),
64+
};
65+
66+
inline Vec<FixedTag<float32_t, 4>>
67+
TaylorPolyExp(Vec<FixedTag<float32_t, 4>> x, const Vec<FixedTag<float32_t, 4>> *coeffs) {
68+
FixedTag<float32_t, 4> d;
69+
using VF32 = Vec<decltype(d)>;
70+
71+
VF32 A = MulAdd(coeffs[4], x, coeffs[0]);
72+
VF32 B = MulAdd(coeffs[6], x, coeffs[2]);
73+
VF32 C = MulAdd(coeffs[5], x, coeffs[1]);
74+
VF32 D = MulAdd(coeffs[7], x, coeffs[3]);
75+
VF32 x2 = Mul(x, x);
76+
VF32 x4 = Mul(x2, x2);
77+
VF32 res = MulAdd(MulAdd(D, x2, C), x4, MulAdd(B, x2, A));
78+
return res;
79+
}
80+
81+
inline Vec<FixedTag<float32_t, 4>> ExpF32(Vec<FixedTag<float32_t, 4>> x) {
82+
FixedTag<float32_t, 4> d;
83+
using VF32 = Vec<decltype(d)>;
84+
Rebind<int32_t, decltype(d)> s32;
85+
using VS32 = Vec<decltype(s32)>;
86+
Rebind<float32_t, decltype(s32)> fs32;
87+
static const VF32 CONST_LN2 = Set(d, 0.6931471805f); // ln(2)
88+
static const VF32 CONST_INV_LN2 = Set(d, 1.4426950408f); // 1/ln(2)
89+
// Perform range reduction [-log(2),log(2)]
90+
VS32 m = ConvertTo(s32, Mul(x, CONST_INV_LN2));
91+
auto val = NegMulAdd(ConvertTo(fs32, m), CONST_LN2, x);
92+
// Polynomial Approximation
93+
auto poly = TaylorPolyExp(val, &expTab[0]);
94+
// Reconstruct
95+
poly = BitCast(fs32, Add(BitCast(s32, poly), ShiftLeft<23>(m)));
96+
return poly;
97+
}
98+
99+
inline Vec<FixedTag<float32_t, 4>> LogF32(Vec<FixedTag<float32_t, 4>> x) {
100+
FixedTag<float32_t, 4> d;
101+
using VF32 = Vec<decltype(d)>;
102+
FixedTag<int32_t, 4> s32;
103+
Rebind<int32_t, decltype(d)> sb32;
104+
Rebind<float32_t, decltype(s32)> fli32;
105+
using VS32 = Vec<decltype(s32)>;
106+
static const VS32 CONST_127 = Set(s32, 127); // 127
107+
static const VF32 CONST_LN2 = Set(d, 0.6931471805f); // ln(2)
108+
// Extract exponent
109+
VS32 m = Sub(BitCast(s32, ShiftRight<23>(BitCast(s32, BitCast(sb32, x)))), CONST_127);
110+
VF32 val = BitCast(fli32,
111+
Sub(BitCast(sb32, x), ShiftLeft<23>(m)));
112+
// Polynomial Approximation
113+
VF32 poly = TaylorPolyExp(val, &logTab[0]);
114+
// Reconstruct
115+
poly = MulAdd(ConvertTo(fli32, m), CONST_LN2, poly);
116+
return poly;
117+
}
118+
119+
inline Vec<FixedTag<float32_t, 4>>
120+
PowHLG(Vec<FixedTag<float32_t, 4>> val, Vec<FixedTag<float32_t, 4>> n) {
121+
return ExpF32(Mul(n, LogF32(val)));
122+
}
123+
124+
inline Vec<FixedTag<float32_t, 4>> HLGEotf(Vec<FixedTag<float32_t, 4>> v) {
125+
v = Max(Zero(fixedFloatTag), v);
126+
using VF32 = Vec<decltype(fixedFloatTag)>;
127+
VF32 a = Set(fixedFloatTag, 0.17883277f);
128+
VF32 b = Set(fixedFloatTag, 0.28466892f);
129+
VF32 c = Set(fixedFloatTag, 0.55991073f);
130+
VF32 mm = Set(fixedFloatTag, 0.5f);
131+
const auto cmp = v < mm;
132+
auto branch1 = Div(Mul(v, v), Set(fixedFloatTag, 3.f));
133+
auto branch2 = Div(ExpF32(Add(Div(Sub(v, c), a), b)), Set(fixedFloatTag, 12.0f));
134+
return IfThenElse(cmp, branch1, branch2);
135+
}
136+
137+
float Evaluate(float v) {
138+
// Spec: http://www.arib.or.jp/english/html/overview/doc/2-STD-B67v1_0.pdf
139+
v = std::max(0.0f, v);
140+
constexpr float a = 0.17883277f;
141+
constexpr float b = 0.28466892f;
142+
constexpr float c = 0.55991073f;
143+
if (v <= 0.5f)
144+
v = v * v / 3.0f;
145+
else
146+
v = (exp((v - c) / a) + b) / 12.f;
147+
return v;
148+
}
149+
150+
template<class D, typename T = Vec<D>>
151+
inline T bt2020HLGGammaCorrection(const D d, T color) {
152+
static const float betaRec2020 = 0.018053968510807f;
153+
static const float alphaRec2020 = 1.09929682680944f;
154+
T bt2020 = Set(d, betaRec2020);
155+
T alpha2020 = Set(d, alphaRec2020);
156+
const auto cmp = color < bt2020;
157+
const auto branch1 = Mul(color, Set(d, 4.5f));
158+
const auto branch2 =
159+
Sub(Mul(alpha2020, PowHLG(color, Set(d, 0.45f))), Sub(alpha2020, Set(d, 1.0f)));
160+
return IfThenElse(cmp, branch1, branch2);
161+
}
162+
163+
float bt2020HLGGammaCorrection(float linear) {
164+
static const float betaRec2020 = 0.018053968510807f;
165+
static const float alphaRec2020 = 1.09929682680944f;
166+
if (0 <= betaRec2020 && linear < betaRec2020) {
167+
return 4.5f * linear;
168+
} else if (betaRec2020 <= linear && linear < 1) {
169+
return alphaRec2020 * powf(linear, 0.45f) - (alphaRec2020 - 1.0f);
170+
}
171+
return linear;
172+
}
173+
174+
void TransferROWHLGU8(uint8_t *data, float maxColors) {
175+
auto r = (float) data[0] / (float) maxColors;
176+
auto g = (float) data[1] / (float) maxColors;
177+
auto b = (float) data[2] / (float) maxColors;
178+
data[0] = (uint8_t) std::clamp(
179+
(float) bt2020HLGGammaCorrection(Evaluate(r)) * maxColors, 0.0f, maxColors);
180+
data[1] = (uint8_t) std::clamp(
181+
(float) bt2020HLGGammaCorrection(Evaluate(g)) * maxColors, 0.0f, maxColors);
182+
data[2] = (uint8_t) std::clamp(
183+
(float) bt2020HLGGammaCorrection(Evaluate(b)) * maxColors, 0.0f, maxColors);
184+
}
185+
186+
void TransferROWHLGU16(uint16_t *data, float maxColors) {
187+
auto r = (float) data[0] / (float) maxColors;
188+
auto g = (float) data[1] / (float) maxColors;
189+
auto b = (float) data[2] / (float) maxColors;
190+
data[0] = (uint16_t) float_to_half((float) bt2020HLGGammaCorrection(Evaluate(r)));
191+
data[1] = (uint16_t) float_to_half((float) bt2020HLGGammaCorrection(Evaluate(g)));
192+
data[2] = (uint16_t) float_to_half((float) bt2020HLGGammaCorrection(Evaluate(b)));
193+
}
194+
195+
void ProcessHLGF16Row(uint16_t *HWY_RESTRICT data, int width, float maxColors) {
196+
const FixedTag<float32_t, 4> df32;
197+
FixedTag<uint16_t, 4> d;
198+
199+
const Rebind<uint32_t, decltype(d)> signed32;
200+
const Rebind<int32_t, decltype(df32)> floatToSigned;
201+
const Rebind<uint8_t, FixedTag<float32_t, 4>> rebindOrigin;
202+
203+
const Rebind<float32_t, decltype(signed32)> rebind32;
204+
const FixedTag<uint16_t, 4> du16;
205+
const Rebind<float16_t, decltype(df32)> rebind16;
206+
207+
using VU16 = Vec<decltype(d)>;
208+
using VF32 = Vec<decltype(df32)>;
209+
210+
auto ptr16 = reinterpret_cast<uint16_t *>(data);
211+
212+
VF32 vColors = Set(df32, (float) maxColors);
213+
214+
int pixels = 4;
215+
216+
int x = 0;
217+
for (x = 0; x + pixels < width; x += pixels) {
218+
VU16 RURow;
219+
VU16 GURow;
220+
VU16 BURow;
221+
VU16 AURow;
222+
LoadInterleaved4(d, reinterpret_cast<uint16_t *>(ptr16), RURow, GURow, BURow,
223+
AURow);
224+
VF32 r32 = BitCast(rebind32, PromoteTo(signed32, RURow));
225+
VF32 g32 = BitCast(rebind32, PromoteTo(signed32, GURow));
226+
VF32 b32 = BitCast(rebind32, PromoteTo(signed32, BURow));
227+
228+
VF32 pqR = bt2020HLGGammaCorrection(df32, HLGEotf(r32));
229+
VF32 pqG = bt2020HLGGammaCorrection(df32, HLGEotf(g32));
230+
VF32 pqB = bt2020HLGGammaCorrection(df32, HLGEotf(b32));
231+
232+
VU16 rNew = BitCast(du16, DemoteTo(rebind16, pqR));
233+
VU16 gNew = BitCast(du16, DemoteTo(rebind16, pqG));
234+
VU16 bNew = BitCast(du16, DemoteTo(rebind16, pqB));
235+
236+
StoreInterleaved4(rNew, gNew, bNew, AURow, d,
237+
reinterpret_cast<uint8_t *>(ptr16));
238+
ptr16 += 4 * 4;
239+
}
240+
241+
for (; x < width; ++x) {
242+
TransferROWHLGU16(reinterpret_cast<uint16_t *>(ptr16), maxColors);
243+
ptr16 += 4;
244+
}
245+
}
246+
247+
void ProcessHLGu8Row(uint8_t *HWY_RESTRICT data, int width, float maxColors) {
248+
const FixedTag<float32_t, 4> df32;
249+
hwy::HWY_NAMESPACE::FixedTag<uint8_t, 4> d;
250+
251+
const Rebind<uint32_t, decltype(d)> signed32;
252+
const Rebind<int32_t, decltype(df32)> floatToSigned;
253+
const Rebind<uint8_t, FixedTag<float32_t, 4>> rebindOrigin;
254+
255+
const Rebind<float32_t, decltype(signed32)> rebind32;
256+
257+
using VU16 = Vec<decltype(d)>;
258+
using VF32 = Vec<decltype(df32)>;
259+
260+
auto ptr16 = reinterpret_cast<uint8_t *>(data);
261+
262+
VF32 vColors = Set(df32, (float) maxColors);
263+
264+
int pixels = 4;
265+
266+
int x = 0;
267+
for (x = 0; x + pixels < width; x += pixels) {
268+
VU16 RURow;
269+
VU16 GURow;
270+
VU16 BURow;
271+
VU16 AURow;
272+
LoadInterleaved4(d, reinterpret_cast<uint8_t *>(ptr16), RURow, GURow, BURow, AURow);
273+
VF32 r32 = Div(ConvertTo(rebind32, PromoteTo(signed32, RURow)), vColors);
274+
VF32 g32 = Div(ConvertTo(rebind32, PromoteTo(signed32, GURow)), vColors);
275+
VF32 b32 = Div(ConvertTo(rebind32, PromoteTo(signed32, BURow)), vColors);
276+
277+
VF32 pqR = Max(
278+
Min(Mul(bt2020HLGGammaCorrection(df32, HLGEotf(r32)), vColors), vColors),
279+
Zero(df32));
280+
VF32 pqG = Max(
281+
Min(Mul(bt2020HLGGammaCorrection(df32, HLGEotf(g32)), vColors), vColors),
282+
Zero(df32));
283+
VF32 pqB = Max(
284+
Min(Mul(bt2020HLGGammaCorrection(df32, HLGEotf(b32)), vColors), vColors),
285+
Zero(df32));
286+
287+
VU16 rNew = DemoteTo(rebindOrigin, ConvertTo(floatToSigned, pqR));
288+
VU16 gNew = DemoteTo(rebindOrigin, ConvertTo(floatToSigned, pqG));
289+
VU16 bNew = DemoteTo(rebindOrigin, ConvertTo(floatToSigned, pqB));
290+
291+
StoreInterleaved4(rNew, gNew, bNew, AURow, d,
292+
reinterpret_cast<uint8_t *>(ptr16));
293+
ptr16 += 4 * 4;
294+
}
295+
296+
for (; x < width; ++x) {
297+
TransferROWHLGU8(reinterpret_cast<uint8_t *>(ptr16), maxColors);
298+
ptr16 += 4;
299+
}
300+
}
301+
302+
void
303+
ProcessHLG(uint8_t *data, bool halfFloats, int stride, int width, int height, int depth) {
304+
float maxColors = powf(2, (float) depth) - 1;
305+
ThreadPool pool;
306+
std::vector<std::future<void>> results;
307+
for (int y = 0; y < height; ++y) {
308+
if (halfFloats) {
309+
auto ptr16 = reinterpret_cast<uint16_t *>(data + y * stride);
310+
auto r = pool.enqueue(ProcessHLGF16Row, reinterpret_cast<uint16_t *>(ptr16),
311+
width,
312+
(float) maxColors);
313+
results.push_back(std::move(r));
314+
} else {
315+
auto ptr16 = reinterpret_cast<uint8_t *>(data + y * stride);
316+
auto r = pool.enqueue(ProcessHLGu8Row, reinterpret_cast<uint8_t *>(ptr16),
317+
width,
318+
(float) maxColors);
319+
results.push_back(std::move(r));
320+
}
321+
}
322+
323+
for (auto &result: results) {
324+
result.wait();
325+
}
326+
}
327+
328+
}
329+
}
330+
331+
HWY_AFTER_NAMESPACE();
332+
333+
#if HWY_ONCE
334+
namespace coder {
335+
HWY_EXPORT(ProcessHLG);
336+
HWY_DLLEXPORT void
337+
ProcessHLG(uint8_t *data, bool halfFloats, int stride, int width, int height, int depth) {
338+
if (halfFloats) {
339+
// auto ptr16 = reinterpret_cast<uint16_t *>(data + y * stride);
340+
// if (halfFloats) {
341+
// HWY_DYNAMIC_DISPATCH(ProcessHLGu8Row)(reinterpret_cast<uint16_t *>(ptr16), width);
342+
// }
343+
} else {
344+
HWY_DYNAMIC_DISPATCH(ProcessHLG)(data, halfFloats, stride, width, height, depth);
345+
}
346+
}
347+
}
348+
#endif

avif-coder/src/main/cpp/HLG.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
//
2+
// Created by Radzivon Bartoshyk on 15/09/2023.
3+
//
4+
5+
#ifndef AVIF_HLG_H
6+
#define AVIF_HLG_H
7+
8+
#include <cstdint>
9+
10+
namespace coder {
11+
void ProcessHLG(uint8_t *data, bool halfFloats, int stride, int width, int height, int depth);
12+
}
13+
14+
#endif //AVIF_HLG_H

avif-coder/src/main/cpp/PerceptualQuantinizer.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,6 @@ namespace coder {
8686
return std::min(2.3f * pow(v, 2.8f), v / 5.0f + 0.8f);
8787
}
8888

89-
9089
float ToLinearPQ(float v) {
9190
v = fmax(0.0f, v);
9291
float m1 = (2610.0f / 4096.0f) / 4.0f;
@@ -113,7 +112,6 @@ namespace coder {
113112
data[2] = float_to_half((float) bt2020GammaCorrection(smpte.b * scale));
114113
}
115114

116-
117115
void TransferROWU8(uint8_t *data, float maxColors) {
118116
auto r = (float) data[0] / (float) maxColors;
119117
auto g = (float) data[1] / (float) maxColors;
@@ -136,7 +134,7 @@ namespace coder {
136134
}
137135

138136
template<class D, typename T = Vec<D>>
139-
T bt2020GammaCorrection(const D d, T color) {
137+
inline T bt2020GammaCorrection(const D d, T color) {
140138
T bt2020 = Set(d, betaRec2020);
141139
T alpha2020 = Set(d, alphaRec2020);
142140
const auto cmp = color < bt2020;

0 commit comments

Comments
 (0)