diff --git a/src/support/mo_math_utilities.cpp b/src/support/mo_math_utilities.cpp index e17160624338009456ea692555d90c942e7e8b81..b3031a5f575e7fcfbaf54a46a9ddf4ecad0575e1 100644 --- a/src/support/mo_math_utilities.cpp +++ b/src/support/mo_math_utilities.cpp @@ -12,75 +12,69 @@ #include <vector> #include <iostream> #include <chrono> // For timing +#include <Kokkos_Core.hpp> extern "C" { -void tdma_solver_vec(double *a, double *b, double *c, double *d, - int slev, int elev, int startidx, int endidx, - int nrows, int ncols, double *varout, int opt_acc_queue = -1) { - - // Start timing - auto start_time = std::chrono::high_resolution_clock::now(); - - int acc_queue = (opt_acc_queue == -1) ? 1 : opt_acc_queue; // Use 1 as the default if opt_acc_queue is not provided - - double* cp = new double[nrows * ncols]; - double* dp = new double[nrows * ncols]; - - #define IDX(row, col) ((col) * nrows + (row)) // performs better than lambda function - - // OpenACC Parallel Region - #pragma acc parallel default(present) create(cp[:nrows*ncols], dp[:nrows*ncols]) async(acc_queue) - { - // Initialize c-prime and d-prime - #pragma acc loop gang(static: 1) vector - for (int jc = startidx; jc < endidx; ++jc) { - cp[IDX(jc, slev)] = c[IDX(jc, slev)] / b[IDX(jc, slev)]; - dp[IDX(jc, slev)] = d[IDX(jc, slev)] / b[IDX(jc, slev)]; - } - - // Solve for vectors c-prime and d-prime - #pragma acc loop seq - for (int i = slev + 1; i < elev; ++i) { - #pragma acc loop gang(static: 1) vector - for (int jc = startidx; jc < endidx; ++jc) { - double m = 1.0 / (b[IDX(jc, i)] - cp[IDX(jc, i - 1)] * a[IDX(jc, i)]); - cp[IDX(jc, i)] = c[IDX(jc, i)] * m; - dp[IDX(jc, i)] = (d[IDX(jc, i)] - dp[IDX(jc, i - 1)] * a[IDX(jc, i)]) * m; - } - } - - // Initialize varout - #pragma acc loop gang(static: 1) vector - for (int jc = startidx; jc < endidx; ++jc) { - varout[IDX(jc, elev-1)] = dp[IDX(jc, elev-1)]; - } - - // Solve for varout from the vectors c-prime and d-prime - #pragma acc loop seq - for (int i = elev - 2; i >= slev; --i) { - #pragma acc loop gang(static: 1) vector - for (int jc = startidx; jc < endidx; ++jc) { - varout[IDX(jc, i)] = dp[IDX(jc, i)] - cp[IDX(jc, i)] * varout[IDX(jc, i + 1)]; - } - } - } - - printf("tdma_solver_vec: completed using C++\n"); - - // Wait for OpenACC asynchronous operations to complete if acc_queue is not optional - if (opt_acc_queue == -1) { - #pragma acc wait(acc_queue) - } - - // Free memory at the end - delete[] cp; - delete[] dp; - - // End timing - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration<double> elapsed_time = end_time - start_time; - - std::cout << "Elapsed time for tdma_solver_vec (C++): " << elapsed_time.count() << " seconds" << std::endl; +void tdma_solver_vec_kokkos(const double* a, const double* b, const double* c, const double* d, + int slev, int elev, int startidx, int endidx, + int nrows, int ncols, double* varout) { + + // Start timing + auto start_time = std::chrono::high_resolution_clock::now(); + + // Allocate temporary arrays using Kokkos::View. + // The views c_p and d_p are allocated as 2D arrays with dimensions [nrows][ncols]. + // Kokkos::View automatically handles memory management. + Kokkos::View<double**> c_p("c_p", nrows, ncols); + Kokkos::View<double**> d_p("d_p", nrows, ncols); + + // Wrap the input arrays in unmanaged views. + // We assume that the input arrays are laid out in column-major order as in the original code. + // Here we use LayoutLeft so that the first index (row) is contiguous. + typedef Kokkos::View<const double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedConst2D; + typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> Unmanaged2D; + UnmanagedConst2D a_view(a, nrows, ncols); + UnmanagedConst2D b_view(b, nrows, ncols); + UnmanagedConst2D c_view(c, nrows, ncols); + UnmanagedConst2D d_view(d, nrows, ncols); + Unmanaged2D varout_view(varout, nrows, ncols); + + // Initialize c-prime and d-prime at the starting level (slev) + Kokkos::parallel_for("init_c_p_d_p", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) { + c_p(jc, slev) = c_view(jc, slev) / b_view(jc, slev); + d_p(jc, slev) = d_view(jc, slev) / b_view(jc, slev); + }); + Kokkos::fence(); + + // Forward sweep: compute c-prime and d-prime for each column from slev+1 to elev-1. + for (int i = slev + 1; i < elev; ++i) { + Kokkos::parallel_for("forward_sweep", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) { + double m = 1.0 / (b_view(jc, i) - c_p(jc, i - 1) * a_view(jc, i)); + c_p(jc, i) = c_view(jc, i) * m; + d_p(jc, i) = (d_view(jc, i) - d_p(jc, i - 1) * a_view(jc, i)) * m; + }); + Kokkos::fence(); + } + + // Initialize the output array at the last level (elev-1) + Kokkos::parallel_for("init_varout", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) { + varout_view(jc, elev-1) = d_p(jc, elev-1); + }); + Kokkos::fence(); + + // Back substitution: update varout for columns from elev-2 down to slev. + for (int i = elev - 2; i >= slev; --i) { + Kokkos::parallel_for("back_substitution", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) { + varout_view(jc, i) = d_p(jc, i) - c_p(jc, i) * varout_view(jc, i + 1); + }); + Kokkos::fence(); + } + + // End timing and print the elapsed time + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration<double> elapsed_time = end_time - start_time; + std::cout << "Elapsed time for tdma_solver_vec (Kokkos): " << elapsed_time.count() << " seconds" << std::endl; } + }