Skip to content
Snippets Groups Projects
Commit 64d639c4 authored by Pradipta Samanta's avatar Pradipta Samanta
Browse files

converted the c++ code in mo_math_utilities to Kokkos

parent d2ee94aa
No related branches found
No related tags found
1 merge request!37Draft: C++ port of horizontal/mo_lib_gradients.F90
......@@ -12,75 +12,69 @@
#include <vector>
#include <iostream>
#include <chrono> // For timing
#include <Kokkos_Core.hpp>
extern "C" {
void tdma_solver_vec(double *a, double *b, double *c, double *d,
int slev, int elev, int startidx, int endidx,
int nrows, int ncols, double *varout, int opt_acc_queue = -1) {
// Start timing
auto start_time = std::chrono::high_resolution_clock::now();
int acc_queue = (opt_acc_queue == -1) ? 1 : opt_acc_queue; // Use 1 as the default if opt_acc_queue is not provided
double* cp = new double[nrows * ncols];
double* dp = new double[nrows * ncols];
#define IDX(row, col) ((col) * nrows + (row)) // performs better than lambda function
// OpenACC Parallel Region
#pragma acc parallel default(present) create(cp[:nrows*ncols], dp[:nrows*ncols]) async(acc_queue)
{
// Initialize c-prime and d-prime
#pragma acc loop gang(static: 1) vector
for (int jc = startidx; jc < endidx; ++jc) {
cp[IDX(jc, slev)] = c[IDX(jc, slev)] / b[IDX(jc, slev)];
dp[IDX(jc, slev)] = d[IDX(jc, slev)] / b[IDX(jc, slev)];
}
// Solve for vectors c-prime and d-prime
#pragma acc loop seq
for (int i = slev + 1; i < elev; ++i) {
#pragma acc loop gang(static: 1) vector
for (int jc = startidx; jc < endidx; ++jc) {
double m = 1.0 / (b[IDX(jc, i)] - cp[IDX(jc, i - 1)] * a[IDX(jc, i)]);
cp[IDX(jc, i)] = c[IDX(jc, i)] * m;
dp[IDX(jc, i)] = (d[IDX(jc, i)] - dp[IDX(jc, i - 1)] * a[IDX(jc, i)]) * m;
}
}
// Initialize varout
#pragma acc loop gang(static: 1) vector
for (int jc = startidx; jc < endidx; ++jc) {
varout[IDX(jc, elev-1)] = dp[IDX(jc, elev-1)];
}
// Solve for varout from the vectors c-prime and d-prime
#pragma acc loop seq
for (int i = elev - 2; i >= slev; --i) {
#pragma acc loop gang(static: 1) vector
for (int jc = startidx; jc < endidx; ++jc) {
varout[IDX(jc, i)] = dp[IDX(jc, i)] - cp[IDX(jc, i)] * varout[IDX(jc, i + 1)];
}
}
}
printf("tdma_solver_vec: completed using C++\n");
// Wait for OpenACC asynchronous operations to complete if acc_queue is not optional
if (opt_acc_queue == -1) {
#pragma acc wait(acc_queue)
}
// Free memory at the end
delete[] cp;
delete[] dp;
// End timing
auto end_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = end_time - start_time;
std::cout << "Elapsed time for tdma_solver_vec (C++): " << elapsed_time.count() << " seconds" << std::endl;
void tdma_solver_vec_kokkos(const double* a, const double* b, const double* c, const double* d,
int slev, int elev, int startidx, int endidx,
int nrows, int ncols, double* varout) {
// Start timing
auto start_time = std::chrono::high_resolution_clock::now();
// Allocate temporary arrays using Kokkos::View.
// The views c_p and d_p are allocated as 2D arrays with dimensions [nrows][ncols].
// Kokkos::View automatically handles memory management.
Kokkos::View<double**> c_p("c_p", nrows, ncols);
Kokkos::View<double**> d_p("d_p", nrows, ncols);
// Wrap the input arrays in unmanaged views.
// We assume that the input arrays are laid out in column-major order as in the original code.
// Here we use LayoutLeft so that the first index (row) is contiguous.
typedef Kokkos::View<const double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedConst2D;
typedef Kokkos::View<double**, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> Unmanaged2D;
UnmanagedConst2D a_view(a, nrows, ncols);
UnmanagedConst2D b_view(b, nrows, ncols);
UnmanagedConst2D c_view(c, nrows, ncols);
UnmanagedConst2D d_view(d, nrows, ncols);
Unmanaged2D varout_view(varout, nrows, ncols);
// Initialize c-prime and d-prime at the starting level (slev)
Kokkos::parallel_for("init_c_p_d_p", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
c_p(jc, slev) = c_view(jc, slev) / b_view(jc, slev);
d_p(jc, slev) = d_view(jc, slev) / b_view(jc, slev);
});
Kokkos::fence();
// Forward sweep: compute c-prime and d-prime for each column from slev+1 to elev-1.
for (int i = slev + 1; i < elev; ++i) {
Kokkos::parallel_for("forward_sweep", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
double m = 1.0 / (b_view(jc, i) - c_p(jc, i - 1) * a_view(jc, i));
c_p(jc, i) = c_view(jc, i) * m;
d_p(jc, i) = (d_view(jc, i) - d_p(jc, i - 1) * a_view(jc, i)) * m;
});
Kokkos::fence();
}
// Initialize the output array at the last level (elev-1)
Kokkos::parallel_for("init_varout", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
varout_view(jc, elev-1) = d_p(jc, elev-1);
});
Kokkos::fence();
// Back substitution: update varout for columns from elev-2 down to slev.
for (int i = elev - 2; i >= slev; --i) {
Kokkos::parallel_for("back_substitution", Kokkos::RangePolicy<>(startidx, endidx), KOKKOS_LAMBDA (const int jc) {
varout_view(jc, i) = d_p(jc, i) - c_p(jc, i) * varout_view(jc, i + 1);
});
Kokkos::fence();
}
// End timing and print the elapsed time
auto end_time = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed_time = end_time - start_time;
std::cout << "Elapsed time for tdma_solver_vec (Kokkos): " << elapsed_time.count() << " seconds" << std::endl;
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment