// Source: OptimalSlopeParallelisation/OptimalSlopeTechnicalTask.cpp (repository MikeO89/OptimalSlopeParallelisation, branch main)
/**
 * @file OptimalSlopeTechnicalTask.cpp
 * @brief Compares the performance of parallel and sequential matrix computations using OpenMP as a technical task for Optimal Slope.
 * @details This program benchmarks two implementations (parallel and sequential) of a matrix computation
 *          involving a modified Taylor Series Expansion. The results are reported in nanoseconds and
 *          include the execution time disparity between the two approaches.
 * @author Mike Orr
 * @date Sun 08 Dec 2024
 */


#include <omp.h>

#include <chrono>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <new>
#include <stdexcept>
#include <string>
#include <thread>
#include <utility>
#include <vector>

// Conversion factors: how many nanoseconds make up each larger unit.
// main() divides the nanosecond disparity by these to print it in us, ms and s.
constexpr long long NS_IN_MICROSECOND = 1000;
constexpr long long NS_IN_MILLISECOND = 1000000;
constexpr long long NS_IN_SECOND = 1000000000;

/**
* @brief Builds a square matrix of doubles.
* @param[in] size Number of rows, which is also the number of columns in every row.
* @return A nested vector with `size` rows of `size` elements, every element value-initialised to 0.0.
*/

inline std::vector<std::vector<double>> createBoxArray(int size)
{
    // One zero-filled row serves as the prototype that the outer vector copies `size` times.
    const std::vector<double> zeroRow(size);
    std::vector<std::vector<double>> box(size, zeroRow);

    return box;
}

/**
* @brief Times a computation over a freshly allocated 2D array and prints the elapsed time.
* @details The array is created with createBoxArray() before the clock is started, so only the call to
*          `compute` is measured. On success the line `<prefixOutput><elapsed>ns` is written to std::cout.
* @tparam F Callable type invocable as `compute(std::vector<std::vector<double>> &)`.
* @param[in] prefixOutput A string to prepend to the timing result to identify which computation is being timed.
* @param[in] compute A function (or lambda) that fills the 2D array passed to it by reference.
* @param[in] rowColCount The size (rows and columns) of the 2D array.
* @return The elapsed time for the computation, measured in nanoseconds, or 0 if allocation or the
*         computation threw.
* @note Exceptions derived from std::exception are caught here and reported on std::cerr; they do not
*       propagate to the caller.
*/

template <typename F>
long long processArray(const std::string &prefixOutput, F compute, unsigned int rowColCount)
{
    long long elapsedTimeNS = 0;

    try
    {
        // Initialise directly from the returned temporary; wrapping it in std::move would only block copy elision.
        std::vector<std::vector<double>> result = createBoxArray(rowColCount);

        // steady_clock is monotonic, so the measured interval cannot be skewed by wall-clock adjustments.
        const auto startTime = std::chrono::steady_clock::now();
        compute(result);
        const auto endTime = std::chrono::steady_clock::now();

        elapsedTimeNS = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime).count();

        std::cout << prefixOutput << elapsedTimeNS << "ns" << std::endl;
    }
    catch (const std::bad_alloc &e)
    {
        std::cerr << "Memory allocation failed: " << e.what() << std::endl;
    }
    catch (const std::exception &e)
    {
        // __func__ is standard C++; the compiler-specific __FUNCSIG__ is not available outside MSVC.
        std::cerr << "Error in " << __func__ << ": " << e.what() << std::endl;
    }

    return elapsedTimeNS;
}

/**
* @brief Computes a single term of a Taylor-style series: \((a + b)^c / c!\).
*
* @details The function returns exactly one term, \(\frac{(a + b)^c}{c!}\); it does not perform any
* summation itself. Summing the terms for \(c = 0 \dots 9\), as the loops in this file do, gives the
* series \(\sum_{c=0}^{9} \frac{x^c}{c!}\) with \(x = a + b\), i.e. the Maclaurin expansion of \(e^{x}\)
* truncated after ten terms. The factorials 0! to 9! come from a constant lookup table.
*
* @param[in] a The first addend of the base \(a + b\).
* @param[in] b The second addend of the base \(a + b\).
* @param[in] c The exponent of the term, which is also the index into the factorial table. Must be in 0 - 9.
* @throws std::invalid_argument If `c` is outside the range 0 - 9.
* @return The value \((a + b)^c / c!\) as a double.
*/

inline double computeTaylorSeriesExapansion(int a, int b, int c)
{
    // 0! .. 9!; constexpr makes the table a genuine read-only compile-time constant.
    static constexpr double factorial[10] = {1, 1, 2, 6, 24, 120, 720, 5040, 40320, 362880};

    // Guard the table index before it is used.
    if(c < 0 || c > 9)
        throw std::invalid_argument("c should be in the range of 0 - 9.");

    return std::pow(a + b, c) / factorial[c]; // factorial[c] == std::tgamma(c + 1);
}

/**
* @brief Computes the Taylor Series Expansion values in parallel across a 2D array and measures the execution time.
* @param[in] rowColCount The size (rows and columns) of the 2D array.
* @details The `i` and `j` loops are collapsed into one iteration space with
*          `#pragma omp parallel for collapse(2)` and divided among the OpenMP threads. Each iteration
*          sums the ten series terms for one cell into a local variable and writes only `result[i][j]`,
*          so no two iterations write the same element.
* @note Every cell costs the same ten term evaluations, so the iterations are split with `schedule(static)`.
*       `schedule(dynamic)` without a chunk size hands out one iteration at a time, which adds scheduling
*       overhead to each of the rowColCount * rowColCount cells without improving the balance.
* @return The elapsed time of the parallel computation in nanoseconds (as reported by processArray()).
*/

long long showParallelLoops(int rowColCount)
{
    return processArray("Execution time (Parallel): ", [rowColCount](std::vector<std::vector<double>> &result) -> void
    {
        // k stays within 0 - 9, so the range check inside computeTaylorSeriesExapansion never throws here;
        // that matters because an exception must not leave an OpenMP parallel region.
        #pragma omp parallel for collapse(2) shared(result) schedule(static)
        for(int i = 0; i < rowColCount; i++)
        {
            for(int j = 0; j < rowColCount; j++)
            {
                double sum = 0.0; // Thread-local accumulator for the ten terms of this cell

                for (int k = 0; k < 10; k++)
                    sum += computeTaylorSeriesExapansion(i,j,k);

                result[i][j] = sum;
            }
        }
    }, rowColCount);
}

/**
* @brief Computes the Taylor Series Expansion values sequentially across a 2D array and measures the execution time.
* @param[in] rowColCount The size (rows and columns) of the 2D array.
* @details Runs the same per-cell kernel as the parallel variant on a single thread: the ten series terms
*          of a cell are summed into a local variable and stored once. Keeping both kernels identical means
*          the measured disparity reflects the parallelisation rather than a difference in how each cell
*          is accumulated.
* @return The elapsed time of the sequential computation in nanoseconds (as reported by processArray()).
*/

long long showNonParallelLoops(int rowColCount)
{
    return processArray("Execution time (Non-parallel): ", [rowColCount](std::vector<std::vector<double>> &result) -> void
    {
        for (int i = 0; i < rowColCount; i++)
        {
            for (int j = 0; j < rowColCount; j++)
            {
                // The array starts zero-filled, so summing from 0.0 and assigning once stores the same
                // value as accumulating directly into result[i][j].
                double sum = 0.0;

                for (int k = 0; k < 10; k++)
                    sum += computeTaylorSeriesExapansion(i,j,k);

                result[i][j] = sum;
            }
        }
    }, rowColCount);
}

/**
* @brief Application entry point that runs the parallel and non-parallel computations.
* @param[in] argC Number of arguments passed to the application.
* @param[in] argV A list of arguments passed to the application.
*            If provided, `argV[1]` is expected to be an integer specifying the size of the matrix.
* @details The matrix size (`rowColCount`) defaults to 3000 and must lie between 1 and 10,000 to prevent
*          excessive memory usage. The argument is parsed inside the try block, so a non-numeric or
*          out-of-range value is reported as a normal failure instead of terminating the program with an
*          uncaught exception. The OpenMP thread count is set to std::thread::hardware_concurrency(),
*          falling back to 2 when that reports 0.
* @return Exit code. 0 is success, non-zero indicates failure.
*/

int main(int argC, char **argV)
{
    int threadCount = 0;
    int rowColCount = 3000; // Default value
    long long parallelExecTimeNS = 0, nonParallelExecTimeNS = 0, execTimeDisparityNS = 0;

    try
    {
        // Should a row/col count be provided, then overwrite the default value.
        // std::stoi throws std::invalid_argument / std::out_of_range on bad input; both are handled below.
        if(argC > 1)
            rowColCount = std::stoi(argV[1]);

        // Validate the signed value directly so a negative argument is rejected rather than wrapped to unsigned.
        if(rowColCount < 1 || rowColCount > 10000)
            throw std::invalid_argument("Row/column count must be between 1 - 10000.");

        // Estimate the number of threads the system can handle concurrently based on the available hardware
        threadCount = static_cast<int>(std::thread::hardware_concurrency());

        if(!threadCount)
            threadCount = 2; // Default to two threads if unable to determine the number of threads

        // Set the omp thread count to the determined value
        omp_set_num_threads(threadCount);

        parallelExecTimeNS = showParallelLoops(rowColCount);
        nonParallelExecTimeNS = showNonParallelLoops(rowColCount);
        execTimeDisparityNS = std::abs(nonParallelExecTimeNS - parallelExecTimeNS);

        // The unit conversions below use integer division, so each figure is truncated, not rounded.
        std::cout << "Matrix size: " << rowColCount << "x" << rowColCount << std::endl
            << "Thread count: " << threadCount << std::endl << std::endl
            << "Execution disparity: " << execTimeDisparityNS << "ns, "
            << execTimeDisparityNS / NS_IN_MICROSECOND << "us, "
            << execTimeDisparityNS / NS_IN_MILLISECOND << "ms, "
            << execTimeDisparityNS / NS_IN_SECOND << "s" << std::endl;
    }
    catch(const std::exception &e)
    {
        std::cerr << "Program execution failed: " << e.what() << std::endl;
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}