Skip to content

Commit a0faeff

Browse files
committed
Parallelism
1 parent 7ec413d commit a0faeff

File tree

13 files changed

+166
-76
lines changed

13 files changed

+166
-76
lines changed

app/src/main/java/com/radzivon/bartoshyk/avif/MainActivity.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ class MainActivity : AppCompatActivity() {
9898
var allFiles = mutableListOf<String>()
9999
allFiles.addAll(allFiles2)
100100
allFiles.addAll(allFiles1)
101-
allFiles = allFiles.takeLast(4).toMutableList()
101+
// allFiles = allFiles.takeLast(4).toMutableList()
102102
for (file in allFiles) {
103103
try {
104104
val buffer = this@MainActivity.assets.open(file).source().buffer()

avif-coder/src/main/cpp/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,9 @@ add_definitions(-DCMS_NO_REGISTER_KEYWORD -DSTB_IMAGE_RESIZE_IMPLEMENTATION)
6363

6464
set(CMAKE_ANDROID_API_MIN 24)
6565

66-
target_include_directories(coder PRIVATE ${CMAKE_SOURCE_DIR}/libheif ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/libyuv ${CMAKE_SOURCE_DIR}/colorspace)
66+
target_include_directories(coder PRIVATE ${CMAKE_SOURCE_DIR}/libheif
67+
${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/libyuv
68+
${CMAKE_SOURCE_DIR}/colorspace ${CMAKE_SOURCE_DIR}/algo)
6769

6870
find_library( # Sets the name of the path variable.
6971
log-lib
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#pragma once
2+
3+
#include <functional>
4+
#include <mutex>
5+
#include <queue>
6+
#include <thread>
7+
#include <vector>
8+
#include <type_traits>
9+
10+
namespace concurrency {
11+
12+
// Primary template of a small type trait. It is only declared here, never
// defined, so it can be used solely through a matching specialization.
template<typename Function>
13+
struct function_traits;
14+
15+
// Specialization for plain function types of the form R(Args...): exposes the
// function's return type R as the nested alias `result_type`.
template<typename R, typename... Args>
16+
struct function_traits<R(Args...)> {
17+
using result_type = R;
18+
};
19+
20+
/**
 * Runs `func(y, args...)` once for every `y` in `[0, numIterations)`, splitting the
 * range into contiguous segments that are executed on up to `numThreads` threads.
 *
 * The calling thread processes the first segment itself; every other segment is
 * executed on a freshly spawned `std::thread`. The call returns only after all
 * segments have finished.
 *
 * @param numThreads    Requested number of threads. Values below 1 are treated as 1 and
 *                      values above `numIterations` are reduced to `numIterations`, so no
 *                      thread is ever started with an empty range.
 * @param numIterations Number of iterations. Nothing is executed when it is <= 0.
 * @param func          Callable invoked as `func(int iteration, args...)`. It is invoked
 *                      from several threads at once and must tolerate that.
 * @param args          Extra arguments forwarded to every invocation of `func`. The same
 *                      objects are forwarded on each call, so `func` must not consume
 *                      (move from) rvalue arguments.
 */
template<typename Function, typename... Args>
void parallel_for(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
    static_assert(std::is_invocable_v<Function, int, Args...>, "func must take an int parameter for iteration id");

    if (numIterations <= 0) {
        return;
    }

    // Clamp the worker count: a non-positive request would divide by zero (or drop all
    // iterations), and more workers than iterations would only spawn idle threads.
    int workerCount = numThreads;
    if (workerCount < 1) {
        workerCount = 1;
    }
    if (workerCount > numIterations) {
        workerCount = numIterations;
    }

    const int segmentHeight = numIterations / workerCount;

    auto parallelWorker = [&](int start, int end) {
        for (int y = start; y < end; ++y) {
            std::invoke(func, y, std::forward<Args>(args)...);
        }
    };

    // Joins every started thread on scope exit, including during stack unwinding.
    // Destroying a joinable std::thread calls std::terminate, and the workers hold
    // references to `func`/`args`, so they must finish before this frame goes away.
    struct JoinGuard {
        std::vector<std::thread> &pool;

        ~JoinGuard() {
            for (auto &thread: pool) {
                if (thread.joinable()) {
                    thread.join();
                }
            }
        }
    };

    std::vector<std::thread> threads;
    JoinGuard joinGuard{threads};
    threads.reserve(static_cast<std::size_t>(workerCount - 1));

    // Launch N-1 worker threads; the last one also takes the division remainder.
    for (int i = 1; i < workerCount; ++i) {
        const int start = i * segmentHeight;
        const int end = (i == workerCount - 1) ? numIterations : start + segmentHeight;
        threads.emplace_back(parallelWorker, start, end);
    }

    // The calling thread handles the first segment (the whole range when single-threaded).
    parallelWorker(0, workerCount == 1 ? numIterations : segmentHeight);
}
62+
63+
/**
 * Runs `func(threadId, y, args...)` once for every `y` in `[0, numIterations)`, splitting
 * the range into contiguous segments that are executed on up to `numThreads` threads.
 *
 * The calling thread processes the first segment with `threadId == 0`; segment `i`
 * (for `i >= 1`) runs on a freshly spawned `std::thread` and receives `threadId == i`.
 * The call returns only after all segments have finished.
 *
 * @param numThreads    Requested number of threads. Values below 1 are treated as 1 and
 *                      values above `numIterations` are reduced to `numIterations`, so
 *                      every `threadId` passed to `func` lies in `[0, max(numThreads, 1))`.
 * @param numIterations Number of iterations. Nothing is executed when it is <= 0.
 * @param func          Callable invoked as `func(int threadId, int iteration, args...)`.
 *                      It is invoked from several threads at once and must tolerate that.
 * @param args          Extra arguments forwarded to every invocation of `func`. The same
 *                      objects are forwarded on each call, so `func` must not consume
 *                      (move from) rvalue arguments.
 */
template<typename Function, typename... Args>
void parallel_for_with_thread_id(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
    static_assert(std::is_invocable_v<Function, int, int, Args...>, "func must take an int parameter for threadId, and iteration Id");

    if (numIterations <= 0) {
        return;
    }

    // Clamp the worker count: a non-positive request would divide by zero (or drop all
    // iterations), and more workers than iterations would only spawn idle threads.
    int workerCount = numThreads;
    if (workerCount < 1) {
        workerCount = 1;
    }
    if (workerCount > numIterations) {
        workerCount = numIterations;
    }

    const int segmentHeight = numIterations / workerCount;

    auto parallel_worker = [&](int threadId, int start, int end) {
        for (int y = start; y < end; ++y) {
            std::invoke(func, threadId, y, std::forward<Args>(args)...);
        }
    };

    // Joins every started thread on scope exit, including during stack unwinding.
    // Destroying a joinable std::thread calls std::terminate, and the workers hold
    // references to `func`/`args`, so they must finish before this frame goes away.
    struct JoinGuard {
        std::vector<std::thread> &pool;

        ~JoinGuard() {
            for (auto &thread: pool) {
                if (thread.joinable()) {
                    thread.join();
                }
            }
        }
    };

    std::vector<std::thread> threads;
    JoinGuard joinGuard{threads};
    threads.reserve(static_cast<std::size_t>(workerCount - 1));

    // Launch N-1 worker threads; the last one also takes the division remainder.
    for (int i = 1; i < workerCount; ++i) {
        const int start = i * segmentHeight;
        const int end = (i == workerCount - 1) ? numIterations : start + segmentHeight;
        threads.emplace_back(parallel_worker, i, start, end);
    }

    // The calling thread handles the first segment (the whole range when single-threaded).
    parallel_worker(0, 0, workerCount == 1 ? numIterations : segmentHeight);
}
105+
}

avif-coder/src/main/cpp/colorspace/HDRTransferAdapter.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <thread>
3232
#include "imagebits/half.hpp"
3333
#include "Eigen/Eigen"
34+
#include "concurrency.hpp"
3435

3536
using namespace half_float;
3637
using namespace std;
@@ -878,13 +879,12 @@ namespace coder::HWY_NAMESPACE {
878879
Eigen::Matrix3f *conversion,
879880
const float gamma,
880881
const bool useChromaticAdaptation) {
881-
#pragma omp parallel for num_threads(7) schedule(dynamic)
882-
for (int y = 0; y < height; ++y) {
882+
concurrency::parallel_for(6, height, [&](int y) {
883883
ProcessCPURowHWY(data, y, halfFloats,
884884
stride, width, maxColors, gammaCorrection,
885885
function, curveToneMapper, conversion, gamma,
886886
useChromaticAdaptation);
887-
}
887+
});
888888
}
889889
}
890890

avif-coder/src/main/cpp/colorspace/colorspace.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include <vector>
3333
#include <thread>
3434
#include <android/log.h>
35+
#include "concurrency.hpp"
3536

3637
using namespace std;
3738

@@ -326,21 +327,19 @@ convertUseProfiles(std::vector<uint8_t> &srcVector, int stride,
326327

327328
int threadCount = clamp(min(static_cast<int>(std::thread::hardware_concurrency()),
328329
height * width / (256 * 256)), 1, 12);
329-
std::vector<std::thread> workers;
330330

331331
int segmentHeight = height / threadCount;
332332

333333
auto mOutputBuffer = iccARGB.data();
334334
auto mInputBuffer = srcVector.data();
335335
auto mTransform = ptrTransform.get();
336336

337-
#pragma omp parallel for num_threads(6) schedule(dynamic)
338-
for (int y = 0; y < height; ++y) {
337+
concurrency::parallel_for(6, height, [&](int y) {
339338
cmsDoTransformLineStride(mTransform,
340339
mInputBuffer + stride * y,
341340
mOutputBuffer + dstStride * y, width, 1,
342341
stride, stride, 0, 0);
343-
}
342+
});
344343

345344
srcVector = iccARGB;
346345
*newStride = dstStride;

avif-coder/src/main/cpp/imagebits/CopyUnalignedRGBA.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include <cstdint>
3131
#include <thread>
3232
#include <vector>
33+
#include "concurrency.hpp"
3334

3435
using namespace std;
3536

@@ -80,8 +81,7 @@ namespace coder::HWY_NAMESPACE {
8081
int width,
8182
int height,
8283
int pixelSize) {
83-
#pragma omp parallel for num_threads(3) schedule(dynamic)
84-
for (int y = 0; y < height; ++y) {
84+
concurrency::parallel_for(2, height, [&](int y) {
8585
if (pixelSize == 1) {
8686
const ScalableTag<uint8_t> du8;
8787
Copy1Row<decltype(du8), uint8_t>(du8,
@@ -112,7 +112,7 @@ namespace coder::HWY_NAMESPACE {
112112
reinterpret_cast<uint8_t *>(dst) +
113113
(y * dstStride)), width);
114114
}
115-
}
115+
});
116116
}
117117
}
118118

avif-coder/src/main/cpp/imagebits/RGBAlpha.cpp

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
*/
2828

2929
#include "RGBAlpha.h"
30+
#include "concurrency.hpp"
3031

3132
using namespace std;
3233

@@ -81,9 +82,7 @@ namespace coder::HWY_NAMESPACE {
8182
void UnpremultiplyRGBA_HWY(const uint8_t *src, int srcStride,
8283
uint8_t *dst, int dstStride, int width,
8384
int height) {
84-
#pragma omp parallel for num_threads(4) schedule(dynamic)
85-
for (int y = 0; y < height; ++y) {
86-
85+
concurrency::parallel_for(2, height, [&]( int y) {
8786
const FixedTag<uint8_t, 16> du8x16;
8887
const FixedTag<uint16_t, 8> du16x8;
8988
const FixedTag<uint8_t, 8> du8x8;
@@ -146,7 +145,7 @@ namespace coder::HWY_NAMESPACE {
146145
mSrc += 4;
147146
mDst += 4;
148147
}
149-
}
148+
});
150149
}
151150

152151
template<typename D, typename I = Vec<D>>
@@ -192,9 +191,7 @@ namespace coder::HWY_NAMESPACE {
192191
void PremultiplyRGBA_HWY(const uint8_t *src, int srcStride,
193192
uint8_t *dst, int dstStride, int width,
194193
int height) {
195-
#pragma omp parallel for num_threads(4) schedule(dynamic)
196-
for (int y = 0; y < height; ++y) {
197-
194+
concurrency::parallel_for(2, height, [&](int y) {
198195
const FixedTag<uint8_t, 16> du8x16;
199196
const FixedTag<uint16_t, 8> du16x8;
200197
const FixedTag<uint8_t, 8> du8x8;
@@ -233,7 +230,7 @@ namespace coder::HWY_NAMESPACE {
233230
mSrc += 4;
234231
mDst += 4;
235232
}
236-
}
233+
});
237234
}
238235
}
239236

avif-coder/src/main/cpp/imagebits/Rgb1010102.cpp

Lines changed: 9 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <thread>
3232
#include <algorithm>
3333
#include "half.hpp"
34+
#include "concurrency.hpp"
3435

3536
using namespace std;
3637

@@ -372,16 +373,11 @@ namespace coder::HWY_NAMESPACE {
372373
auto src = reinterpret_cast<const uint8_t *>(source);
373374
auto dst = reinterpret_cast<uint8_t *>(destination);
374375

375-
#pragma omp parallel for num_threads(4) schedule(dynamic)
376-
for (int y = 0; y < height; ++y) {
377-
Rgba8ToRGBA1010102HWYRow(reinterpret_cast<const uint8_t *>(src +
378-
srcStride *
379-
y),
380-
reinterpret_cast<uint32_t *>(dst +
381-
dstStride *
382-
y),
376+
concurrency::parallel_for(2, height, [&](int y) {
377+
Rgba8ToRGBA1010102HWYRow(reinterpret_cast<const uint8_t *>(src + srcStride * y),
378+
reinterpret_cast<uint32_t *>(dst + dstStride * y),
383379
width, &permuteMap[0], attenuateAlpha);
384-
}
380+
});
385381
}
386382

387383
void
@@ -395,15 +391,11 @@ namespace coder::HWY_NAMESPACE {
395391
auto src = reinterpret_cast<const uint8_t *>(source);
396392
auto dst = reinterpret_cast<uint8_t *>(destination);
397393

398-
#pragma omp parallel for num_threads(4) schedule(dynamic)
399-
for (int y = 0; y < height; ++y) {
400-
F16ToRGBA1010102HWYRow(reinterpret_cast<const uint16_t *>(src +
401-
srcStride *
402-
y),
403-
reinterpret_cast<uint32_t *>(dst +
404-
dstStride * y),
394+
concurrency::parallel_for(2, height, [&](int y) {
395+
F16ToRGBA1010102HWYRow(reinterpret_cast<const uint16_t *>(src + srcStride * y),
396+
reinterpret_cast<uint32_t *>(dst + dstStride * y),
405397
width, &permuteMap[0]);
406-
}
398+
});
407399
}
408400

409401
// NOLINTNEXTLINE(google-readability-namespace-comments)

avif-coder/src/main/cpp/imagebits/Rgb565.cpp

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include <vector>
3232
#include "half.hpp"
3333
#include <algorithm>
34+
#include "concurrency.hpp"
3435

3536
using namespace std;
3637

@@ -250,13 +251,11 @@ namespace coder::HWY_NAMESPACE {
250251
auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
251252
auto mDst = reinterpret_cast<uint8_t *>(dst);
252253

253-
#pragma omp parallel for num_threads(4) schedule(dynamic)
254-
for (int y = 0; y < height; ++y) {
255-
Rgba8To565HWYRow(
256-
reinterpret_cast<const uint8_t *>(mSrc + y * srcStride),
257-
reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
258-
attenuateAlpha);
259-
}
254+
concurrency::parallel_for(2, height, [&](int y) {
255+
Rgba8To565HWYRow(reinterpret_cast<const uint8_t *>(mSrc + y * srcStride),
256+
reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
257+
attenuateAlpha);
258+
});
260259
}
261260

262261
inline Vec<FixedTag<uint8_t, 8>>
@@ -329,9 +328,12 @@ namespace coder::HWY_NAMESPACE {
329328
}
330329

331330
for (; x < width; ++x) {
332-
uint8_t r = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[0]), 0.0f, 1.0f) * maxColors));
333-
uint8_t g = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[1]), 0.0f, 1.0f) * maxColors));
334-
uint8_t b = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[2]), 0.0f, 1.0f) * maxColors));
331+
uint8_t r = static_cast<uint8_t >(roundf(
332+
clamp(LoadHalf(src[0]), 0.0f, 1.0f) * maxColors));
333+
uint8_t g = static_cast<uint8_t >(roundf(
334+
clamp(LoadHalf(src[1]), 0.0f, 1.0f) * maxColors));
335+
uint8_t b = static_cast<uint8_t >(roundf(
336+
clamp(LoadHalf(src[2]), 0.0f, 1.0f) * maxColors));
335337

336338
r = clamp(r, (uint8_t) 0, (uint8_t) maxColors);
337339
g = clamp(g, (uint8_t) 0, (uint8_t) maxColors);
@@ -358,13 +360,11 @@ namespace coder::HWY_NAMESPACE {
358360
auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
359361
auto mDst = reinterpret_cast<uint8_t *>(dst);
360362

361-
#pragma omp parallel for num_threads(4) schedule(dynamic)
362-
for (int y = 0; y < height; ++y) {
363-
RGBAF16To565RowHWY(
364-
reinterpret_cast<const uint16_t *>(mSrc + y * srcStride),
365-
reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
366-
maxColors);
367-
}
363+
concurrency::parallel_for(2, height, [&](int y) {
364+
RGBAF16To565RowHWY(reinterpret_cast<const uint16_t *>(mSrc + y * srcStride),
365+
reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
366+
maxColors);
367+
});
368368
}
369369

370370
void Rgb565ToU8HWY(const uint16_t *sourceData, int srcStride,
@@ -374,13 +374,11 @@ namespace coder::HWY_NAMESPACE {
374374
auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
375375
auto mDst = reinterpret_cast<uint8_t *>(dst);
376376

377-
#pragma omp parallel for num_threads(4) schedule(dynamic)
378-
for (int y = 0; y < height; ++y) {
379-
Rgb565ToU8HWYRow(
380-
reinterpret_cast<const uint16_t *>(mSrc + srcStride * y),
377+
concurrency::parallel_for(2, height, [&](int y) {
378+
Rgb565ToU8HWYRow(reinterpret_cast<const uint16_t *>(mSrc + srcStride * y),
381379
reinterpret_cast<uint8_t *>(mDst + dstStride * y),
382380
width, permuteMap, bgColor);
383-
}
381+
});
384382
}
385383
}
386384

0 commit comments

Comments
 (0)