Parallelism

awxkee · awxkee · commit a0faefff0352 · 2024-02-19T20:53:53.000Z
diff --git a/app/src/main/java/com/radzivon/bartoshyk/avif/MainActivity.kt b/app/src/main/java/com/radzivon/bartoshyk/avif/MainActivity.kt
@@ -98,7 +98,7 @@ class MainActivity : AppCompatActivity() {
             var allFiles = mutableListOf<String>()
             allFiles.addAll(allFiles2)
             allFiles.addAll(allFiles1)
-            allFiles = allFiles.takeLast(4).toMutableList()
+//            allFiles = allFiles.takeLast(4).toMutableList()
             for (file in allFiles) {
                 try {
                     val buffer = this@MainActivity.assets.open(file).source().buffer()
diff --git a/avif-coder/src/main/cpp/CMakeLists.txt b/avif-coder/src/main/cpp/CMakeLists.txt
@@ -63,7 +63,9 @@ add_definitions(-DCMS_NO_REGISTER_KEYWORD -DSTB_IMAGE_RESIZE_IMPLEMENTATION)
 
 set(CMAKE_ANDROID_API_MIN 24)
 
-target_include_directories(coder PRIVATE ${CMAKE_SOURCE_DIR}/libheif ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/libyuv ${CMAKE_SOURCE_DIR}/colorspace)
+target_include_directories(coder PRIVATE ${CMAKE_SOURCE_DIR}/libheif
+        ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/libyuv
+        ${CMAKE_SOURCE_DIR}/colorspace ${CMAKE_SOURCE_DIR}/algo)
 
 find_library( # Sets the name of the path variable.
         log-lib
diff --git a/avif-coder/src/main/cpp/algo/concurrency.hpp b/avif-coder/src/main/cpp/algo/concurrency.hpp
@@ -0,0 +1,105 @@
+#pragma once
+
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+#include <type_traits>
+
+namespace concurrency {
+
+    /// Primary template, intentionally left undefined; only the function-type
+    /// specialization below is usable. Nothing in this header uses it.
+    template<typename Function>
+    struct function_traits;
+
+    /// Exposes the return type R of a plain function type R(Args...) as result_type.
+    template<typename R, typename... Args>
+    struct function_traits<R(Args...)> {
+        using result_type = R;
+    };
+
+    /**
+     * Runs func(threadId, i, args...) exactly once for every i in [0, numIterations),
+     * splitting the range into contiguous chunks that are processed concurrently.
+     *
+     * Chunk 0 runs on the calling thread and every other chunk on its own std::thread;
+     * all spawned threads are joined before the function returns or unwinds.
+     *
+     * @param numThreads    Requested worker count. Values below 1 are treated as 1 and the
+     *                      count is capped at numIterations, so no idle thread is spawned.
+     * @param numIterations Number of iterations; nothing is invoked when it is <= 0.
+     * @param func          Callable invoked as func(threadId, i, args...); threadId is in
+     *                      [0, max(numThreads, 1)) and calls sharing a threadId never overlap.
+     * @param args          Extra arguments handed to every call as lvalues. They are not
+     *                      forwarded: an rvalue would be moved from by the first call and
+     *                      then read, concurrently, by all the remaining ones.
+     *
+     * func must synchronise any state it shares between iterations. An exception leaving
+     * func on a spawned thread terminates the process (std::thread semantics).
+     */
+    template<typename Function, typename... Args>
+    void parallel_for_with_thread_id(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
+        static_assert(std::is_invocable_v<Function &, int, int, Args &...>,
+                      "func must take an int parameter for threadId, and iteration Id");
+
+        if (numIterations <= 0) {
+            return;
+        }
+
+        // Guard the division below against numThreads <= 0 and avoid spawning threads
+        // that would receive an empty chunk.
+        int workerCount = numThreads < 1 ? 1 : numThreads;
+        if (workerCount > numIterations) {
+            workerCount = numIterations;
+        }
+        const int chunkSize = numIterations / workerCount;
+        // The first `remainder` chunks take one extra iteration so sizes differ by at most 1.
+        const int remainder = numIterations % workerCount;
+
+        auto runChunk = [&](int threadId) {
+            const int start = threadId * chunkSize + (threadId < remainder ? threadId : remainder);
+            const int end = start + chunkSize + (threadId < remainder ? 1 : 0);
+            for (int y = start; y < end; ++y) {
+                std::invoke(func, threadId, y, args...);
+            }
+        };
+
+        // Joins in its destructor, so already running threads are also joined when spawning a
+        // thread or the calling thread's chunk throws; destroying a joinable std::thread
+        // would call std::terminate.
+        struct ThreadJoiner {
+            std::vector<std::thread> threads;
+
+            ~ThreadJoiner() {
+                for (auto &thread: threads) {
+                    if (thread.joinable()) {
+                        thread.join();
+                    }
+                }
+            }
+        } joiner;
+
+        for (int i = 1; i < workerCount; ++i) {
+            joiner.threads.emplace_back(runChunk, i);
+        }
+        runChunk(0);
+    }
+
+    /**
+     * Runs func(i, args...) exactly once for every i in [0, numIterations) on up to
+     * numThreads threads, the calling thread included.
+     *
+     * Thin wrapper over parallel_for_with_thread_id that drops the thread id; partitioning,
+     * argument passing and joining behave exactly as documented there.
+     */
+    template<typename Function, typename... Args>
+    void parallel_for(const int numThreads, const int numIterations, Function &&func, Args &&... args) {
+        static_assert(std::is_invocable_v<Function &, int, Args &...>, "func must take an int parameter for iteration id");
+
+        parallel_for_with_thread_id(numThreads, numIterations, [&](int /*threadId*/, int y) {
+            std::invoke(func, y, args...);
+        });
+    }
+}
diff --git a/avif-coder/src/main/cpp/colorspace/HDRTransferAdapter.cpp b/avif-coder/src/main/cpp/colorspace/HDRTransferAdapter.cpp
@@ -31,6 +31,7 @@
 #include <thread>
 #include "imagebits/half.hpp"
 #include "Eigen/Eigen"
+#include "concurrency.hpp"
 
 using namespace half_float;
 using namespace std;
@@ -878,13 +879,12 @@ namespace coder::HWY_NAMESPACE {
                              Eigen::Matrix3f *conversion,
                              const float gamma,
                              const bool useChromaticAdaptation) {
-#pragma omp parallel for num_threads(7) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
+        concurrency::parallel_for(6, height, [&](int y) {
             ProcessCPURowHWY(data, y, halfFloats,
                              stride, width, maxColors, gammaCorrection,
                              function, curveToneMapper, conversion, gamma,
                              useChromaticAdaptation);
-        }
+        });
     }
 }
 
diff --git a/avif-coder/src/main/cpp/colorspace/colorspace.cpp b/avif-coder/src/main/cpp/colorspace/colorspace.cpp
@@ -32,6 +32,7 @@
 #include <vector>
 #include <thread>
 #include <android/log.h>
+#include "concurrency.hpp"
 
 using namespace std;
 
@@ -326,21 +327,19 @@ convertUseProfiles(std::vector<uint8_t> &srcVector, int stride,
 
     int threadCount = clamp(min(static_cast<int>(std::thread::hardware_concurrency()),
                                 height * width / (256 * 256)), 1, 12);
-    std::vector<std::thread> workers;
 
     int segmentHeight = height / threadCount;
 
     auto mOutputBuffer = iccARGB.data();
     auto mInputBuffer = srcVector.data();
     auto mTransform = ptrTransform.get();
 
-#pragma omp parallel for num_threads(6) schedule(dynamic)
-    for (int y = 0; y < height; ++y) {
+    concurrency::parallel_for(6, height, [&](int y) {
         cmsDoTransformLineStride(mTransform,
                                  mInputBuffer + stride * y,
                                  mOutputBuffer + dstStride * y, width, 1,
                                  stride, stride, 0, 0);
-    }
+    });
 
     srcVector = iccARGB;
     *newStride = dstStride;
diff --git a/avif-coder/src/main/cpp/imagebits/CopyUnalignedRGBA.cpp b/avif-coder/src/main/cpp/imagebits/CopyUnalignedRGBA.cpp
@@ -30,6 +30,7 @@
 #include <cstdint>
 #include <thread>
 #include <vector>
+#include "concurrency.hpp"
 
 using namespace std;
 
@@ -80,8 +81,7 @@ namespace coder::HWY_NAMESPACE {
                       int width,
                       int height,
                       int pixelSize) {
-    #pragma omp parallel for num_threads(3) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
+        concurrency::parallel_for(2, height, [&](int y) {
             if (pixelSize == 1) {
                 const ScalableTag<uint8_t> du8;
                 Copy1Row<decltype(du8), uint8_t>(du8,
@@ -112,7 +112,7 @@ namespace coder::HWY_NAMESPACE {
                                                            reinterpret_cast<uint8_t *>(dst) +
                                                            (y * dstStride)), width);
             }
-        }
+        });
     }
 }
 
diff --git a/avif-coder/src/main/cpp/imagebits/RGBAlpha.cpp b/avif-coder/src/main/cpp/imagebits/RGBAlpha.cpp
@@ -27,6 +27,7 @@
  */
 
 #include "RGBAlpha.h"
+#include "concurrency.hpp"
 
 using namespace std;
 
@@ -81,9 +82,7 @@ namespace coder::HWY_NAMESPACE {
     void UnpremultiplyRGBA_HWY(const uint8_t *src, int srcStride,
                                uint8_t *dst, int dstStride, int width,
                                int height) {
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-
+        concurrency::parallel_for(2, height, [&]( int y) {
             const FixedTag<uint8_t, 16> du8x16;
             const FixedTag<uint16_t, 8> du16x8;
             const FixedTag<uint8_t, 8> du8x8;
@@ -146,7 +145,7 @@ namespace coder::HWY_NAMESPACE {
                 mSrc += 4;
                 mDst += 4;
             }
-        }
+        });
     }
 
     template<typename D, typename I = Vec<D>>
@@ -192,9 +191,7 @@ namespace coder::HWY_NAMESPACE {
     void PremultiplyRGBA_HWY(const uint8_t *src, int srcStride,
                              uint8_t *dst, int dstStride, int width,
                              int height) {
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-
+        concurrency::parallel_for(2, height, [&](int y) {
             const FixedTag<uint8_t, 16> du8x16;
             const FixedTag<uint16_t, 8> du16x8;
             const FixedTag<uint8_t, 8> du8x8;
@@ -233,7 +230,7 @@ namespace coder::HWY_NAMESPACE {
                 mSrc += 4;
                 mDst += 4;
             }
-        }
+        });
     }
 }
 
diff --git a/avif-coder/src/main/cpp/imagebits/Rgb1010102.cpp b/avif-coder/src/main/cpp/imagebits/Rgb1010102.cpp
@@ -31,6 +31,7 @@
 #include <thread>
 #include <algorithm>
 #include "half.hpp"
+#include "concurrency.hpp"
 
 using namespace std;
 
@@ -372,16 +373,11 @@ namespace coder::HWY_NAMESPACE {
         auto src = reinterpret_cast<const uint8_t *>(source);
         auto dst = reinterpret_cast<uint8_t *>(destination);
 
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-            Rgba8ToRGBA1010102HWYRow(reinterpret_cast<const uint8_t *>(src +
-                                                                       srcStride *
-                                                                       y),
-                                     reinterpret_cast<uint32_t *>(dst +
-                                                                  dstStride *
-                                                                  y),
+        concurrency::parallel_for(2, height, [&](int y) {
+            Rgba8ToRGBA1010102HWYRow(reinterpret_cast<const uint8_t *>(src + srcStride * y),
+                                     reinterpret_cast<uint32_t *>(dst + dstStride * y),
                                      width, &permuteMap[0], attenuateAlpha);
-        }
+        });
     }
 
     void
@@ -395,15 +391,11 @@ namespace coder::HWY_NAMESPACE {
         auto src = reinterpret_cast<const uint8_t *>(source);
         auto dst = reinterpret_cast<uint8_t *>(destination);
 
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-            F16ToRGBA1010102HWYRow(reinterpret_cast<const uint16_t *>(src +
-                                                                      srcStride *
-                                                                      y),
-                                   reinterpret_cast<uint32_t *>(dst +
-                                                                dstStride * y),
+        concurrency::parallel_for(2, height, [&](int y) {
+            F16ToRGBA1010102HWYRow(reinterpret_cast<const uint16_t *>(src + srcStride * y),
+                                   reinterpret_cast<uint32_t *>(dst + dstStride * y),
                                    width, &permuteMap[0]);
-        }
+        });
     }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
diff --git a/avif-coder/src/main/cpp/imagebits/Rgb565.cpp b/avif-coder/src/main/cpp/imagebits/Rgb565.cpp
@@ -31,6 +31,7 @@
 #include <vector>
 #include "half.hpp"
 #include <algorithm>
+#include "concurrency.hpp"
 
 using namespace std;
 
@@ -250,13 +251,11 @@ namespace coder::HWY_NAMESPACE {
         auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
         auto mDst = reinterpret_cast<uint8_t *>(dst);
 
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-            Rgba8To565HWYRow(
-                    reinterpret_cast<const uint8_t *>(mSrc + y * srcStride),
-                    reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
-                    attenuateAlpha);
-        }
+        concurrency::parallel_for(2, height, [&](int y) {
+            Rgba8To565HWYRow(reinterpret_cast<const uint8_t *>(mSrc + y * srcStride),
+                             reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
+                             attenuateAlpha);
+        });
     }
 
     inline Vec<FixedTag<uint8_t, 8>>
@@ -329,9 +328,12 @@ namespace coder::HWY_NAMESPACE {
         }
 
         for (; x < width; ++x) {
-            uint8_t r = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[0]), 0.0f, 1.0f) * maxColors));
-            uint8_t g = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[1]), 0.0f, 1.0f) * maxColors));
-            uint8_t b = static_cast<uint8_t >(roundf(clamp(LoadHalf(src[2]), 0.0f, 1.0f) * maxColors));
+            uint8_t r = static_cast<uint8_t >(roundf(
+                    clamp(LoadHalf(src[0]), 0.0f, 1.0f) * maxColors));
+            uint8_t g = static_cast<uint8_t >(roundf(
+                    clamp(LoadHalf(src[1]), 0.0f, 1.0f) * maxColors));
+            uint8_t b = static_cast<uint8_t >(roundf(
+                    clamp(LoadHalf(src[2]), 0.0f, 1.0f) * maxColors));
 
             r = clamp(r, (uint8_t) 0, (uint8_t) maxColors);
             g = clamp(g, (uint8_t) 0, (uint8_t) maxColors);
@@ -358,13 +360,11 @@ namespace coder::HWY_NAMESPACE {
         auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
         auto mDst = reinterpret_cast<uint8_t *>(dst);
 
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-            RGBAF16To565RowHWY(
-                    reinterpret_cast<const uint16_t *>(mSrc + y * srcStride),
-                    reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
-                    maxColors);
-        }
+        concurrency::parallel_for(2, height, [&](int y) {
+            RGBAF16To565RowHWY(reinterpret_cast<const uint16_t *>(mSrc + y * srcStride),
+                               reinterpret_cast<uint16_t *>(mDst + y * dstStride), width,
+                               maxColors);
+        });
     }
 
     void Rgb565ToU8HWY(const uint16_t *sourceData, int srcStride,
@@ -374,13 +374,11 @@ namespace coder::HWY_NAMESPACE {
         auto mSrc = reinterpret_cast<const uint8_t *>(sourceData);
         auto mDst = reinterpret_cast<uint8_t *>(dst);
 
-#pragma omp parallel for num_threads(4) schedule(dynamic)
-        for (int y = 0; y < height; ++y) {
-            Rgb565ToU8HWYRow(
-                    reinterpret_cast<const uint16_t *>(mSrc + srcStride * y),
+        concurrency::parallel_for(2, height, [&](int y) {
+            Rgb565ToU8HWYRow(reinterpret_cast<const uint16_t *>(mSrc + srcStride * y),
                     reinterpret_cast<uint8_t *>(mDst + dstStride * y),
                     width, permuteMap, bgColor);
-        }
+        });
     }
 }
 
diff --git a/avif-coder/src/main/cpp/imagebits/Rgba8ToF16.cpp b/avif-coder/src/main/cpp/imagebits/Rgba8ToF16.cpp
diff --git a/avif-coder/src/main/cpp/imagebits/RgbaF16bitNBitU8.cpp b/avif-coder/src/main/cpp/imagebits/RgbaF16bitNBitU8.cpp
diff --git a/avif-coder/src/main/cpp/imagebits/RgbaF16bitToNBitU16.cpp b/avif-coder/src/main/cpp/imagebits/RgbaF16bitToNBitU16.cpp
diff --git a/avif-coder/src/main/cpp/imagebits/RgbaU16toHF.cpp b/avif-coder/src/main/cpp/imagebits/RgbaU16toHF.cpp