diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9837066c4e73121489509fc82a07ff6e5c72eb59..5109bb5083a03c8c118f84718daf5026ff1923d7 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -17,7 +17,7 @@ stages: variables: ACCOUNT_CPU: "ka1125" ACCOUNT_GPU: "bk1341" - SLURM_OPTIONS_CPU: "--account=$ACCOUNT_CPU --partition=shared" + SLURM_OPTIONS_CPU: "--account=$ACCOUNT_CPU --partition=shared --time=00:10:00" SLURM_OPTIONS_GPU: "--account=$ACCOUNT_GPU --partition=gpu --gpus=1" SLURM_NTASKS: "--ntasks=1" GIT_CONFIG_COUNT: 1 @@ -94,7 +94,8 @@ nvhpc_gpu: - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/24.7-gcc-11.2.0 - mkdir nvhpc_gpu - cd nvhpc_gpu - - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran -DIM_ENABLE_OPENACC=ON + - export LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib64:$LD_LIBRARY_PATH + - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran -DIM_ENABLE_GPU=nvidia-sm80 -DIM_ENABLE_OPENACC=ON - make VERBOSE=1 - make test tags: diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp index d086e8ba4d7910986117e912fa32cdc4f4426ac5..0138cc0408c6ee068a73bafed2d7001bb51eaae9 100644 --- a/src/horizontal/mo_lib_divrot.cpp +++ b/src/horizontal/mo_lib_divrot.cpp @@ -36,9 +36,6 @@ void recon_lsq_cell_l(const T *p_cc, const int *cell_neighbor_idx, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T *> z_d("z_d", lsq_dim_c); - Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk); - UnmanagedConstInt3D iidx(cell_neighbor_idx, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(cell_neighbor_blk, nproma, nblks_c, lsq_dim_c); @@ -54,35 +51,38 @@ void recon_lsq_cell_l(const T *p_cc, const int *cell_neighbor_idx, nblks_c); UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk); - for (int jb = i_startblk; jb < i_endblk; 
++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_l_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_d(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + T z_d[3]; // Local array instead of shared View + T z_qt_times_d[2]; + + z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - p_cc_view(jc, jk, jb); - z_d(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - + z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - p_cc_view(jc, jk, jb); - z_d(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - p_cc_view(jc, jk, jb); // matrix multiplication Q^T d (partitioned into 2 dot products) - z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0) + - lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1) + - lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2); - z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0) + - lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1) + - lsq_qtmat_c_view(jc, 1, 2, jb) * z_d(2); + z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2]; + z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2]; p_coeff_view(2, jc, jk, jb) = - lsq_rmat_rdiag_c_view(jc, 1, jb) * z_qt_times_d(1); + lsq_rmat_rdiag_c_view(jc, 1, jb) * z_qt_times_d[1]; p_coeff_view(1, jc, jk, jb) = lsq_rmat_rdiag_c_view(jc, 0, jb) * - (z_qt_times_d(0) - + (z_qt_times_d[0] - lsq_rmat_utri_c_view(jc, 0, jb) * p_coeff_view(2, jc, jk, jb)); p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb); }); @@ -124,8 +124,6 @@ void 
recon_lsq_cell_l_svd(const T *p_cc, const int *cell_neighbor_idx, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T *> z_b("z_b", lsq_dim_c); - UnmanagedConstInt3D iidx(cell_neighbor_idx, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(cell_neighbor_blk, nproma, nblks_c, lsq_dim_c); @@ -136,31 +134,32 @@ void recon_lsq_cell_l_svd(const T *p_cc, const int *cell_neighbor_idx, lsq_dim_c, nblks_c); UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_l_svd_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_b(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + T z_b[3]; // Local array instead of shared View + z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - p_cc_view(jc, jk, jb); - z_b(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - + z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - p_cc_view(jc, jk, jb); - z_b(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - p_cc_view(jc, jk, jb); p_coeff_view(2, jc, jk, jb) = - lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2); + lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2]; p_coeff_view(1, jc, jk, jb) = - lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2); + lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 0, 1, jb) * 
z_b[1] + + lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2]; p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb); }); if (l_consv) { @@ -201,9 +200,6 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, nlev); - Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk); - UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c); @@ -221,7 +217,7 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, if (patch_id > 0 || l_limited_area) { Kokkos::MDRangePolicy<Kokkos::Rank<4>> initPolicy( {0, i_startidx_in, slev, i_startblk}, - {lsq_dim_unk + 1, i_endidx_in, elev, i_endblk}); + {lsq_dim_unk + 1, i_endidx_in + 1, elev + 1, i_endblk + 1}); Kokkos::parallel_for( "recon_lsq_cell_q_init", initPolicy, KOKKOS_LAMBDA(const int ji, const int jc, const int jk, const int jb) { @@ -229,103 +225,102 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, }); } - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_q_step1", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_d(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - - p_cc_view(jc, jk, jb); - z_d(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - - p_cc_view(jc, jk, jb); - z_d(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - - p_cc_view(jc, jk, jb); - z_d(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - - p_cc_view(jc, jk, jb); - z_d(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, 
iblk(jc, jb, 4)) - - p_cc_view(jc, jk, jb); - z_d(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - - p_cc_view(jc, jk, jb); - z_d(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - - p_cc_view(jc, jk, jb); - z_d(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - - p_cc_view(jc, jk, jb); - z_d(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - + T z_d[9]; // Local array instead of shared View + T z_qt_times_d[5]; + z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + p_cc_view(jc, jk, jb); + z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - + p_cc_view(jc, jk, jb); + z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + p_cc_view(jc, jk, jb); + z_d[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - + p_cc_view(jc, jk, jb); + z_d[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - + p_cc_view(jc, jk, jb); + z_d[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - + p_cc_view(jc, jk, jb); + z_d[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - + p_cc_view(jc, jk, jb); + z_d[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - + p_cc_view(jc, jk, jb); + z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - p_cc_view(jc, jk, jb); - }); - Kokkos::parallel_for( - "recon_lsq_cell_q_step2", innerPolicy, - KOKKOS_LAMBDA(const int jk, const int jc) { - z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 0, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 0, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 0, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 0, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 0, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 0, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 1, 2, jb) * 
z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 1, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 1, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 1, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 1, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 1, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 1, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(2) = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 2, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 2, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 2, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 2, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 2, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 2, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 2, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 2, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(3) = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 3, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 3, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 3, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 3, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 3, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 3, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 3, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 3, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(4) = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 4, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 4, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 4, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 4, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 4, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 4, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 4, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 4, 8, jb) * z_d(8, jc, jk); - - p_coeff_view(5, jc, jk, jb) = ptr_rrdiag(jc, 4, jb) * z_qt_times_d(4); + + z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 0, 2, jb) 
* z_d[2] + + lsq_qtmat_c_view(jc, 0, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 0, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 0, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 0, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 0, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 0, 8, jb) * z_d[8]; + z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 1, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 1, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 1, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 1, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 1, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 1, 8, jb) * z_d[8]; + z_qt_times_d[2] = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 2, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 2, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 2, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 2, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 2, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 2, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 2, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 2, 8, jb) * z_d[8]; + z_qt_times_d[3] = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 3, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 3, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 3, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 3, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 3, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 3, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 3, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 3, 8, jb) * z_d[8]; + z_qt_times_d[4] = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 4, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 4, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 4, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 4, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 4, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 4, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 4, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 4, 8, jb) * z_d[8]; + + p_coeff_view(5, jc, jk, jb) = ptr_rrdiag(jc, 4, jb) * 
z_qt_times_d[4]; p_coeff_view(4, jc, jk, jb) = ptr_rrdiag(jc, 3, jb) * - (z_qt_times_d(3) - + (z_qt_times_d[3] - ptr_rutri(jc, 0, jb) * p_coeff_view(5, jc, jk, jb)); p_coeff_view(3, jc, jk, jb) = ptr_rrdiag(jc, 2, jb) * - (z_qt_times_d(2) - + (z_qt_times_d[2] - ptr_rutri(jc, 1, jb) * p_coeff_view(4, jc, jk, jb) - ptr_rutri(jc, 2, jb) * p_coeff_view(5, jc, jk, jb)); p_coeff_view(2, jc, jk, jb) = ptr_rrdiag(jc, 1, jb) * - (z_qt_times_d(1) - + (z_qt_times_d[1] - ptr_rutri(jc, 3, jb) * p_coeff_view(3, jc, jk, jb) - ptr_rutri(jc, 4, jb) * p_coeff_view(4, jc, jk, jb) - ptr_rutri(jc, 5, jb) * p_coeff_view(5, jc, jk, jb)); p_coeff_view(1, jc, jk, jb) = ptr_rrdiag(jc, 0, jb) * - (z_qt_times_d(0) - + (z_qt_times_d[0] - ptr_rutri(jc, 6, jb) * p_coeff_view(2, jc, jk, jb) - ptr_rutri(jc, 7, jb) * p_coeff_view(3, jc, jk, jb) - ptr_rutri(jc, 8, jb) * p_coeff_view(4, jc, jk, jb) - @@ -365,8 +360,6 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T ***> z_b("z_b", lsq_dim_c, nproma, elev); - UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c); @@ -388,88 +381,86 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c, }); } - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_q_svd_step1", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_b(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + T z_b[9]; // Local array instead of shared View + z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - p_cc_view(jc, jk, jb); - z_b(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), 
jk, iblk(jc, jb, 1)) - + z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - p_cc_view(jc, jk, jb); - z_b(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - p_cc_view(jc, jk, jb); - z_b(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - + z_b[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - p_cc_view(jc, jk, jb); - z_b(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - + z_b[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - p_cc_view(jc, jk, jb); - z_b(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - + z_b[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - p_cc_view(jc, jk, jb); - z_b(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - + z_b[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - p_cc_view(jc, jk, jb); - z_b(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - + z_b[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - p_cc_view(jc, jk, jb); - z_b(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - + z_b[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - p_cc_view(jc, jk, jb); - }); - Kokkos::parallel_for( - "recon_lsq_cell_q_svd_step2", innerPolicy, - KOKKOS_LAMBDA(const int jk, const int jc) { + p_coeff_view(5, jc, jk, jb) = - lsq_pseudoinv_view(jc, 4, 0, jb) * z_b(0, jc, jk) + - lsq_pseudoinv_view(jc, 4, 1, jb) * z_b(1, jc, jk) + - lsq_pseudoinv_view(jc, 4, 2, jb) * z_b(2, jc, jk) + - lsq_pseudoinv_view(jc, 4, 3, jb) * z_b(3, jc, jk) + - lsq_pseudoinv_view(jc, 4, 4, jb) * z_b(4, jc, jk) + - lsq_pseudoinv_view(jc, 4, 5, jb) * z_b(5, jc, jk) + - lsq_pseudoinv_view(jc, 4, 6, jb) * z_b(6, jc, jk) + - lsq_pseudoinv_view(jc, 4, 7, jb) * z_b(7, jc, jk) + - lsq_pseudoinv_view(jc, 4, 8, jb) * z_b(8, jc, jk); + lsq_pseudoinv_view(jc, 4, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 4, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 4, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 4, 3, jb) * z_b[3] + 
+ lsq_pseudoinv_view(jc, 4, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 4, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 4, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 4, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 4, 8, jb) * z_b[8]; p_coeff_view(4, jc, jk, jb) = - lsq_pseudoinv_view(jc, 3, 0, jb) * z_b(0, jc, jk) + - lsq_pseudoinv_view(jc, 3, 1, jb) * z_b(1, jc, jk) + - lsq_pseudoinv_view(jc, 3, 2, jb) * z_b(2, jc, jk) + - lsq_pseudoinv_view(jc, 3, 3, jb) * z_b(3, jc, jk) + - lsq_pseudoinv_view(jc, 3, 4, jb) * z_b(4, jc, jk) + - lsq_pseudoinv_view(jc, 3, 5, jb) * z_b(5, jc, jk) + - lsq_pseudoinv_view(jc, 3, 6, jb) * z_b(6, jc, jk) + - lsq_pseudoinv_view(jc, 3, 7, jb) * z_b(7, jc, jk) + - lsq_pseudoinv_view(jc, 3, 8, jb) * z_b(8, jc, jk); + lsq_pseudoinv_view(jc, 3, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 3, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 3, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 3, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 3, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 3, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 3, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 3, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 3, 8, jb) * z_b[8]; p_coeff_view(3, jc, jk, jb) = - lsq_pseudoinv_view(jc, 2, 0, jb) * z_b(0, jc, jk) + - lsq_pseudoinv_view(jc, 2, 1, jb) * z_b(1, jc, jk) + - lsq_pseudoinv_view(jc, 2, 2, jb) * z_b(2, jc, jk) + - lsq_pseudoinv_view(jc, 2, 3, jb) * z_b(3, jc, jk) + - lsq_pseudoinv_view(jc, 2, 4, jb) * z_b(4, jc, jk) + - lsq_pseudoinv_view(jc, 2, 5, jb) * z_b(5, jc, jk) + - lsq_pseudoinv_view(jc, 2, 6, jb) * z_b(6, jc, jk) + - lsq_pseudoinv_view(jc, 2, 7, jb) * z_b(7, jc, jk) + - lsq_pseudoinv_view(jc, 2, 8, jb) * z_b(8, jc, jk); + lsq_pseudoinv_view(jc, 2, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 2, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 2, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 2, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 2, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 2, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 2, 6, jb) * z_b[6] + + 
lsq_pseudoinv_view(jc, 2, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 2, 8, jb) * z_b[8]; p_coeff_view(2, jc, jk, jb) = - lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0, jc, jk) + - lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1, jc, jk) + - lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2, jc, jk) + - lsq_pseudoinv_view(jc, 1, 3, jb) * z_b(3, jc, jk) + - lsq_pseudoinv_view(jc, 1, 4, jb) * z_b(4, jc, jk) + - lsq_pseudoinv_view(jc, 1, 5, jb) * z_b(5, jc, jk) + - lsq_pseudoinv_view(jc, 1, 6, jb) * z_b(6, jc, jk) + - lsq_pseudoinv_view(jc, 1, 7, jb) * z_b(7, jc, jk) + - lsq_pseudoinv_view(jc, 1, 8, jb) * z_b(8, jc, jk); + lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 1, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 1, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 1, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 1, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 1, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 1, 8, jb) * z_b[8]; p_coeff_view(1, jc, jk, jb) = - lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0, jc, jk) + - lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1, jc, jk) + - lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2, jc, jk) + - lsq_pseudoinv_view(jc, 0, 3, jb) * z_b(3, jc, jk) + - lsq_pseudoinv_view(jc, 0, 4, jb) * z_b(4, jc, jk) + - lsq_pseudoinv_view(jc, 0, 5, jb) * z_b(5, jc, jk) + - lsq_pseudoinv_view(jc, 0, 6, jb) * z_b(6, jc, jk) + - lsq_pseudoinv_view(jc, 0, 7, jb) * z_b(7, jc, jk) + - lsq_pseudoinv_view(jc, 0, 8, jb) * z_b(8, jc, jk); + lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 0, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 0, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 0, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 0, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 0, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 0, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 0, 8, jb) * z_b[8]; p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb) - 
p_coeff_view(1, jc, jk, jb) * lsq_moments_view(jc, jb, 0) - @@ -505,9 +496,6 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, elev); - Kokkos::View<T *> z_qt_times_d("z_qt_times_d", 9); - UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c); @@ -533,146 +521,146 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, }); } - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_c_step1", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_d(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + T z_d[9]; // Local array instead of shared View + T z_qt_times_d[9]; // Local array instead of shared View + + z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - p_cc_view(jc, jk, jb); - z_d(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - + z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - p_cc_view(jc, jk, jb); - z_d(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - p_cc_view(jc, jk, jb); - z_d(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - + z_d[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - p_cc_view(jc, jk, jb); - z_d(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - + z_d[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - p_cc_view(jc, jk, jb); - z_d(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - + z_d[5] = p_cc_view(iidx(jc, jb, 
5), jk, iblk(jc, jb, 5)) - p_cc_view(jc, jk, jb); - z_d(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - + z_d[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - p_cc_view(jc, jk, jb); - z_d(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - + z_d[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - p_cc_view(jc, jk, jb); - z_d(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - + z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - p_cc_view(jc, jk, jb); - }); - Kokkos::parallel_for( - "recon_lsq_cell_c_step2", innerPolicy, - KOKKOS_LAMBDA(const int jk, const int jc) { - z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 0, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 0, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 0, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 0, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 0, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 0, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 1, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 1, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 1, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 1, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 1, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 1, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 1, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(2) = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 2, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 2, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 2, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 2, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 2, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 2, 6, jb) * z_d(6, jc, jk) + - 
lsq_qtmat_c_view(jc, 2, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 2, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(3) = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 3, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 3, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 3, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 3, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 3, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 3, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 3, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 3, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(4) = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 4, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 4, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 4, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 4, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 4, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 4, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 4, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 4, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(5) = lsq_qtmat_c_view(jc, 5, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 5, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 5, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 5, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 5, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 5, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 5, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 5, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 5, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(6) = lsq_qtmat_c_view(jc, 6, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 6, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 6, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 6, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 6, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 6, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 6, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 6, 7, jb) * z_d(7, jc, jk) + - 
lsq_qtmat_c_view(jc, 6, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(7) = lsq_qtmat_c_view(jc, 7, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 7, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 7, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 7, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 7, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 7, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 7, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 7, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 7, 8, jb) * z_d(8, jc, jk); - z_qt_times_d(8) = lsq_qtmat_c_view(jc, 8, 0, jb) * z_d(0, jc, jk) + - lsq_qtmat_c_view(jc, 8, 1, jb) * z_d(1, jc, jk) + - lsq_qtmat_c_view(jc, 8, 2, jb) * z_d(2, jc, jk) + - lsq_qtmat_c_view(jc, 8, 3, jb) * z_d(3, jc, jk) + - lsq_qtmat_c_view(jc, 8, 4, jb) * z_d(4, jc, jk) + - lsq_qtmat_c_view(jc, 8, 5, jb) * z_d(5, jc, jk) + - lsq_qtmat_c_view(jc, 8, 6, jb) * z_d(6, jc, jk) + - lsq_qtmat_c_view(jc, 8, 7, jb) * z_d(7, jc, jk) + - lsq_qtmat_c_view(jc, 8, 8, jb) * z_d(8, jc, jk); - - p_coeff_view(9, jc, jk, jb) = ptr_rrdiag(jc, 8, jb) * z_qt_times_d(8); + + z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 0, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 0, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 0, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 0, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 0, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 0, 8, jb) * z_d[8]; + z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 1, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 1, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 1, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 1, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 1, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 1, 8, jb) * z_d[8]; + z_qt_times_d[2] = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d[0] + + 
lsq_qtmat_c_view(jc, 2, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 2, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 2, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 2, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 2, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 2, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 2, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 2, 8, jb) * z_d[8]; + z_qt_times_d[3] = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 3, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 3, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 3, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 3, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 3, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 3, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 3, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 3, 8, jb) * z_d[8]; + z_qt_times_d[4] = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 4, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 4, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 4, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 4, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 4, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 4, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 4, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 4, 8, jb) * z_d[8]; + z_qt_times_d[5] = lsq_qtmat_c_view(jc, 5, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 5, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 5, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 5, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 5, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 5, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 5, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 5, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 5, 8, jb) * z_d[8]; + z_qt_times_d[6] = lsq_qtmat_c_view(jc, 6, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 6, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 6, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 6, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 6, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 6, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 6, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 6, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 6, 8, jb) * 
z_d[8]; + z_qt_times_d[7] = lsq_qtmat_c_view(jc, 7, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 7, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 7, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 7, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 7, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 7, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 7, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 7, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 7, 8, jb) * z_d[8]; + z_qt_times_d[8] = lsq_qtmat_c_view(jc, 8, 0, jb) * z_d[0] + + lsq_qtmat_c_view(jc, 8, 1, jb) * z_d[1] + + lsq_qtmat_c_view(jc, 8, 2, jb) * z_d[2] + + lsq_qtmat_c_view(jc, 8, 3, jb) * z_d[3] + + lsq_qtmat_c_view(jc, 8, 4, jb) * z_d[4] + + lsq_qtmat_c_view(jc, 8, 5, jb) * z_d[5] + + lsq_qtmat_c_view(jc, 8, 6, jb) * z_d[6] + + lsq_qtmat_c_view(jc, 8, 7, jb) * z_d[7] + + lsq_qtmat_c_view(jc, 8, 8, jb) * z_d[8]; + + p_coeff_view(9, jc, jk, jb) = ptr_rrdiag(jc, 8, jb) * z_qt_times_d[8]; p_coeff_view(8, jc, jk, jb) = ptr_rrdiag(jc, 7, jb) * - (z_qt_times_d(7) - + (z_qt_times_d[7] - ptr_rutri(jc, 0, jb) * p_coeff_view(9, jc, jk, jb)); p_coeff_view(7, jc, jk, jb) = ptr_rrdiag(jc, 6, jb) * - (z_qt_times_d(6) - + (z_qt_times_d[6] - (ptr_rutri(jc, 1, jb) * p_coeff_view(8, jc, jk, jb) + ptr_rutri(jc, 2, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(6, jc, jk, jb) = ptr_rrdiag(jc, 5, jb) * - (z_qt_times_d(5) - + (z_qt_times_d[5] - (ptr_rutri(jc, 3, jb) * p_coeff_view(7, jc, jk, jb) + ptr_rutri(jc, 4, jb) * p_coeff_view(8, jc, jk, jb) + ptr_rutri(jc, 5, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(5, jc, jk, jb) = ptr_rrdiag(jc, 4, jb) * - (z_qt_times_d(4) - + (z_qt_times_d[4] - (ptr_rutri(jc, 6, jb) * p_coeff_view(6, jc, jk, jb) + ptr_rutri(jc, 7, jb) * p_coeff_view(7, jc, jk, jb) + ptr_rutri(jc, 8, jb) * p_coeff_view(8, jc, jk, jb) + ptr_rutri(jc, 9, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(4, jc, jk, jb) = ptr_rrdiag(jc, 3, jb) * - (z_qt_times_d(3) - + (z_qt_times_d[3] - (ptr_rutri(jc, 10, jb) * p_coeff_view(5, jc, jk, jb) + ptr_rutri(jc, 11, 
jb) * p_coeff_view(6, jc, jk, jb) + ptr_rutri(jc, 12, jb) * p_coeff_view(7, jc, jk, jb) + @@ -680,7 +668,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, ptr_rutri(jc, 14, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(3, jc, jk, jb) = ptr_rrdiag(jc, 2, jb) * - (z_qt_times_d(2) - + (z_qt_times_d[2] - (ptr_rutri(jc, 15, jb) * p_coeff_view(4, jc, jk, jb) + ptr_rutri(jc, 16, jb) * p_coeff_view(5, jc, jk, jb) + ptr_rutri(jc, 17, jb) * p_coeff_view(6, jc, jk, jb) + @@ -689,7 +677,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, ptr_rutri(jc, 20, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(2, jc, jk, jb) = ptr_rrdiag(jc, 1, jb) * - (z_qt_times_d(1) - + (z_qt_times_d[1] - (ptr_rutri(jc, 21, jb) * p_coeff_view(3, jc, jk, jb) + ptr_rutri(jc, 22, jb) * p_coeff_view(4, jc, jk, jb) + ptr_rutri(jc, 23, jb) * p_coeff_view(5, jc, jk, jb) + @@ -699,7 +687,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c, ptr_rutri(jc, 27, jb) * p_coeff_view(9, jc, jk, jb))); p_coeff_view(1, jc, jk, jb) = ptr_rrdiag(jc, 0, jb) * - (z_qt_times_d(0) - + (z_qt_times_d[0] - (ptr_rutri(jc, 28, jb) * p_coeff_view(2, jc, jk, jb) + ptr_rutri(jc, 29, jb) * p_coeff_view(3, jc, jk, jb) + ptr_rutri(jc, 30, jb) * p_coeff_view(4, jc, jk, jb) + @@ -748,8 +736,6 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c, Kokkos::MemoryUnmanaged> UnmanagedConstInt3D; - Kokkos::View<T *> z_b("z_b", 9); - UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c); UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c); @@ -761,13 +747,13 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c, UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk); if (patch_id > 0 || l_limited_area) { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, 
i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<3>> initPolicy( - {slev, i_startidx, 0}, {elev, i_endidx, lsq_dim_unk + 1}); + {slev, i_startidx, 0}, {elev + 1, i_endidx + 1, lsq_dim_unk + 1}); Kokkos::parallel_for( "recon_lsq_cell_c_svd_init", initPolicy, KOKKOS_LAMBDA(const int jk, const int jc, const int ji) { @@ -776,125 +762,126 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c, } } - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "recon_lsq_cell_c_svd_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { - z_b(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - + T z_b[9]; // Local array instead of shared View + z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) - p_cc_view(jc, jk, jb); - z_b(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - + z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) - p_cc_view(jc, jk, jb); - z_b(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - + z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) - p_cc_view(jc, jk, jb); - z_b(3) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - + z_b[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) - p_cc_view(jc, jk, jb); - z_b(4) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - + z_b[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) - p_cc_view(jc, jk, jb); - z_b(5) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - + z_b[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) - p_cc_view(jc, jk, jb); - z_b(6) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - + z_b[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) - p_cc_view(jc, jk, jb); - z_b(7) 
= p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - + z_b[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) - p_cc_view(jc, jk, jb); - z_b(8) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - + z_b[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) - p_cc_view(jc, jk, jb); p_coeff_view(9, jc, jk, jb) = - lsq_pseudoinv_view(jc, 8, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 8, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 8, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 8, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 8, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 8, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 8, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 8, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 8, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 8, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 8, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 8, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 8, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 8, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 8, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 8, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 8, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 8, 8, jb) * z_b[8]; p_coeff_view(8, jc, jk, jb) = - lsq_pseudoinv_view(jc, 7, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 7, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 7, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 7, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 7, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 7, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 7, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 7, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 7, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 7, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 7, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 7, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 7, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 7, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 7, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 7, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 7, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 7, 8, jb) * z_b[8]; 
p_coeff_view(7, jc, jk, jb) = - lsq_pseudoinv_view(jc, 6, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 6, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 6, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 6, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 6, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 6, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 6, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 6, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 6, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 6, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 6, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 6, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 6, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 6, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 6, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 6, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 6, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 6, 8, jb) * z_b[8]; p_coeff_view(6, jc, jk, jb) = - lsq_pseudoinv_view(jc, 5, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 5, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 5, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 5, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 5, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 5, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 5, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 5, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 5, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 5, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 5, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 5, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 5, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 5, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 5, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 5, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 5, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 5, 8, jb) * z_b[8]; p_coeff_view(5, jc, jk, jb) = - lsq_pseudoinv_view(jc, 4, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 4, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 4, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 4, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 4, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 4, 
5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 4, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 4, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 4, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 4, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 4, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 4, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 4, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 4, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 4, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 4, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 4, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 4, 8, jb) * z_b[8]; p_coeff_view(4, jc, jk, jb) = - lsq_pseudoinv_view(jc, 3, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 3, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 3, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 3, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 3, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 3, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 3, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 3, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 3, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 3, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 3, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 3, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 3, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 3, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 3, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 3, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 3, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 3, 8, jb) * z_b[8]; p_coeff_view(3, jc, jk, jb) = - lsq_pseudoinv_view(jc, 2, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 2, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 2, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 2, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 2, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 2, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 2, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 2, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 2, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 2, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 2, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 2, 2, jb) * 
z_b[2] + + lsq_pseudoinv_view(jc, 2, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 2, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 2, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 2, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 2, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 2, 8, jb) * z_b[8]; p_coeff_view(2, jc, jk, jb) = - lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 1, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 1, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 1, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 1, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 1, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 1, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 1, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 1, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 1, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 1, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 1, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 1, 8, jb) * z_b[8]; p_coeff_view(1, jc, jk, jb) = - lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0) + - lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1) + - lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2) + - lsq_pseudoinv_view(jc, 0, 3, jb) * z_b(3) + - lsq_pseudoinv_view(jc, 0, 4, jb) * z_b(4) + - lsq_pseudoinv_view(jc, 0, 5, jb) * z_b(5) + - lsq_pseudoinv_view(jc, 0, 6, jb) * z_b(6) + - lsq_pseudoinv_view(jc, 0, 7, jb) * z_b(7) + - lsq_pseudoinv_view(jc, 0, 8, jb) * z_b(8); + lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] + + lsq_pseudoinv_view(jc, 0, 1, jb) * z_b[1] + + lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2] + + lsq_pseudoinv_view(jc, 0, 3, jb) * z_b[3] + + lsq_pseudoinv_view(jc, 0, 4, jb) * z_b[4] + + lsq_pseudoinv_view(jc, 0, 5, jb) * z_b[5] + + lsq_pseudoinv_view(jc, 0, 6, jb) * z_b[6] + + lsq_pseudoinv_view(jc, 0, 7, jb) * z_b[7] + + lsq_pseudoinv_view(jc, 0, 8, jb) * z_b[8]; 
p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb) - p_coeff_view(1, jc, jk, jb) * lsq_moments_view(jc, jb, 0) - @@ -936,13 +923,13 @@ void div3d(const T *vec_e, const int *cell_edge_idx, const int *cell_edge_blk, UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 3, nblks_c); UnmanagedT3D div_vec_c_view(div_vec_c, nproma, nlev, nblks_c); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div3d_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { div_vec_c_view(jc, jk, jb) = @@ -984,13 +971,13 @@ void div3d_2field(const T *vec_e, const int *cell_edge_idx, UnmanagedConstT3D in2_view(in2, nproma, nlev, nblks_e); UnmanagedT3D out2_view(out2, nproma, nlev, nblks_c); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div3d_2field_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1027,8 +1014,6 @@ void div4d(const int *cell_edge_idx, const int *cell_edge_blk, typedef Kokkos::View<const T ****, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedConstT4D; - typedef Kokkos::View<T ***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> - UnmanagedT3D; typedef Kokkos::View<T ****, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> UnmanagedT4D; typedef Kokkos::View<const int ***, Kokkos::LayoutLeft, @@ -1043,14 +1028,14 @@ void div4d(const int *cell_edge_idx, const int *cell_edge_blk, UnmanagedConstT4D 
f4din_view(f4din, nproma, nlev, nblks_e, dim4d); UnmanagedT4D f4dout_view(f4dout, nproma, nlev, nblks_c, dim4d); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); for (int ji = 0; ji < dim4d; ++ji) { Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev[ji], i_startidx}, - {elev[ji], i_endidx}); + {elev[ji] + 1, i_endidx + 1}); Kokkos::parallel_for( "div4d_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1093,8 +1078,8 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, UnmanagedConstInt3D ieidx(cell_edge_idx, nproma, nblks_c, 3); UnmanagedConstInt3D ieblk(cell_edge_blk, nproma, nblks_c, 3); - UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 4, nblks_e); - UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, nlev, nblks_c); + UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 3, nblks_e); + UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, 4, nblks_c); UnmanagedT3D div_vec_c_view(div_vec_c, nproma, nlev, nblks_c); @@ -1108,13 +1093,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, int i_endblk = i_endblk_in[0]; if (l2fields) { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step1", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1135,13 +1120,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, }); } } else { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[0], 
i_endidx_in[0], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step2", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1160,13 +1145,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, i_startblk = i_startblk_in[1]; i_endblk = i_endblk_in[1]; - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step3", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1175,13 +1160,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, } if (l2fields) { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step4", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1195,13 +1180,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, i_endblk = i_endblk_in[2]; if (l2fields) { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step5", innerPolicy, KOKKOS_LAMBDA(const 
int jk, const int jc) { @@ -1224,13 +1209,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx, }); } } else { - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "div_avg_step6", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { @@ -1273,13 +1258,13 @@ void rot_vertex_atmos(const T *vec_e, const int *vert_edge_idx, UnmanagedT3D rot_vec_view(rot_vec, nproma, nlev, nblks_v); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "rot_vertex_atmos_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jv) { @@ -1326,13 +1311,13 @@ void rot_vertex_ri(const T *vec_e, const int *vert_edge_idx, UnmanagedT3D rot_vec_view(rot_vec, nproma, nlev, nblks_v); - for (int jb = i_startblk; jb < i_endblk; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx}, - {elev, i_endidx}); + {elev + 1, i_endidx + 1}); Kokkos::parallel_for( "rot_vertex_atmos_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jv) { diff --git a/src/interpolation/mo_lib_interpolation_scalar.cpp b/src/interpolation/mo_lib_interpolation_scalar.cpp index 9e4e6c5ab4a3a531cb0cad254cc46b049c5ee58f..6b761dc99ccbbeaa1849b898856c0fc6821a04ff 100644 --- 
a/src/interpolation/mo_lib_interpolation_scalar.cpp +++ b/src/interpolation/mo_lib_interpolation_scalar.cpp @@ -52,7 +52,7 @@ void verts2edges_scalar_lib(const T *p_vertex_in, const int *edge_vertex_idx, UnmanagedConstT3D coeff_int_view(coeff_int, nproma, 2, nblks_e); UnmanagedT3D p_edge_out_view(p_edge_out, nproma, nlev, nblks_e); - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, @@ -117,7 +117,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx, i_startblk = i_startblk_in[0]; i_endblk = i_endblk_in[0]; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_e_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb, @@ -136,10 +136,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx, p_edge_out_view(je, jk, jb) = p_cell_in_view( iidx_view(je, jb, 1), jk, iblk_view(je, jb, 1)); } else { - std::cerr << "mo_interpolation:cells2edges_scalar_lib: error in " - "lateral boundary filling" - << std::endl; - std::exit(EXIT_FAILURE); + Kokkos::abort("mo_interpolation:cells2edges_scalar_lib: error in lateral boundary filling"); } }); Kokkos::fence(); @@ -150,7 +147,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx, i_startblk = i_startblk_in[1]; i_endblk = i_endblk_in[1]; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_e_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb, @@ -207,7 +204,7 @@ void edges2verts_scalar_lib(const T *p_edge_in, const int *vert_edge_idx, UnmanagedConstT3D v_int_view(v_int, nproma, 6, nblks_v); UnmanagedT3D p_vert_out_view(p_vert_out, nproma, nlev, nblks_v); - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= 
i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, @@ -280,7 +277,7 @@ void edges2cells_scalar_lib(const T *p_edge_in, const int *edge_idx, int i_startidx, i_endidx; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); @@ -348,7 +345,7 @@ void cells2verts_scalar_lib(const T *p_cell_in, const int *vert_cell_idx, int i_startidx, i_endidx; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); @@ -429,7 +426,7 @@ void cells2verts_scalar_ri_lib(const T *p_cell_in, const int *vert_cell_idx, int i_startidx, i_endidx; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); @@ -571,15 +568,15 @@ void cell_avg_lib(const T *psi_c, const int *cell_neighbor_idx, // block indices of triangles next to each cell, dim: (nproma,nblks_c, 3) UnmanagedConstInt3D iblk_view(cell_neighbor_blk, nproma, nblks_c, 3); // cell_neighbour_blk - // averaging coefficients, dim: (nproma,nlev,nblks_c) - UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, nlev, nblks_c); + // averaging coefficients, dim: (nproma, 4, nblks_c) + UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, 4, nblks_c); // cell based variable after averaging, dim: (nproma,nlev,nblks_c) UnmanagedT3D avg_psi_c_view(avg_psi_c, nproma, nlev, nblks_c); int i_startidx, i_endidx; - for (int jb = i_startblk; jb < i_endblk + 1; ++jb) { + for (int jb = i_startblk; jb <= i_endblk; ++jb) { get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk, i_endblk, i_startidx, i_endidx); diff --git 
a/src/interpolation/mo_lib_intp_rbf.cpp b/src/interpolation/mo_lib_intp_rbf.cpp index d1178a65397571818db4104ca546ff67eb35d01e..ce6e238f9875ba1dd1dd6621d5f8b2443f56470b 100644 --- a/src/interpolation/mo_lib_intp_rbf.cpp +++ b/src/interpolation/mo_lib_intp_rbf.cpp @@ -180,62 +180,62 @@ void rbf_interpol_c2grad_lib(const T *p_cell_in, const int *rbf_c2grad_idx, "rbf_interpol_c2grad", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { grad_x_view(jc, jk, jb) = - rbf_c2grad_coeff_view(0, 1, jc, jb) * p_cell_in_view(jc, jk, jb) + - rbf_c2grad_coeff_view(1, 1, jc, jb) * + rbf_c2grad_coeff_view(0, 0, jc, jb) * p_cell_in_view(jc, jk, jb) + + rbf_c2grad_coeff_view(1, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(1, jc, jb), jk, rbf_c2grad_blk_view(1, jc, jb)) + - rbf_c2grad_coeff_view(2, 1, jc, jb) * + rbf_c2grad_coeff_view(2, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(2, jc, jb), jk, rbf_c2grad_blk_view(2, jc, jb)) + - rbf_c2grad_coeff_view(3, 1, jc, jb) * + rbf_c2grad_coeff_view(3, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(3, jc, jb), jk, rbf_c2grad_blk_view(3, jc, jb)) + - rbf_c2grad_coeff_view(4, 1, jc, jb) * + rbf_c2grad_coeff_view(4, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(4, jc, jb), jk, rbf_c2grad_blk_view(4, jc, jb)) + - rbf_c2grad_coeff_view(5, 1, jc, jb) * + rbf_c2grad_coeff_view(5, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(5, jc, jb), jk, rbf_c2grad_blk_view(5, jc, jb)) + - rbf_c2grad_coeff_view(6, 1, jc, jb) * + rbf_c2grad_coeff_view(6, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(6, jc, jb), jk, rbf_c2grad_blk_view(6, jc, jb)) + - rbf_c2grad_coeff_view(7, 1, jc, jb) * + rbf_c2grad_coeff_view(7, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(7, jc, jb), jk, rbf_c2grad_blk_view(7, jc, jb)) + - rbf_c2grad_coeff_view(8, 1, jc, jb) * + rbf_c2grad_coeff_view(8, 0, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(8, jc, jb), jk, rbf_c2grad_blk_view(8, jc, jb)) + - rbf_c2grad_coeff_view(9, 1, jc, jb) * + rbf_c2grad_coeff_view(9, 0, jc, 
jb) * p_cell_in_view(rbf_c2grad_idx_view(9, jc, jb), jk, rbf_c2grad_blk_view(9, jc, jb)); grad_y_view(jc, jk, jb) = - rbf_c2grad_coeff_view(0, 2, jc, jb) * p_cell_in_view(jc, jk, jb) + - rbf_c2grad_coeff_view(1, 2, jc, jb) * + rbf_c2grad_coeff_view(0, 1, jc, jb) * p_cell_in_view(jc, jk, jb) + + rbf_c2grad_coeff_view(1, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(1, jc, jb), jk, rbf_c2grad_blk_view(1, jc, jb)) + - rbf_c2grad_coeff_view(2, 2, jc, jb) * + rbf_c2grad_coeff_view(2, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(2, jc, jb), jk, rbf_c2grad_blk_view(2, jc, jb)) + - rbf_c2grad_coeff_view(3, 2, jc, jb) * + rbf_c2grad_coeff_view(3, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(3, jc, jb), jk, rbf_c2grad_blk_view(3, jc, jb)) + - rbf_c2grad_coeff_view(4, 2, jc, jb) * + rbf_c2grad_coeff_view(4, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(4, jc, jb), jk, rbf_c2grad_blk_view(4, jc, jb)) + - rbf_c2grad_coeff_view(5, 2, jc, jb) * + rbf_c2grad_coeff_view(5, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(5, jc, jb), jk, rbf_c2grad_blk_view(5, jc, jb)) + - rbf_c2grad_coeff_view(6, 2, jc, jb) * + rbf_c2grad_coeff_view(6, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(6, jc, jb), jk, rbf_c2grad_blk_view(6, jc, jb)) + - rbf_c2grad_coeff_view(7, 2, jc, jb) * + rbf_c2grad_coeff_view(7, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(7, jc, jb), jk, rbf_c2grad_blk_view(7, jc, jb)) + - rbf_c2grad_coeff_view(8, 2, jc, jb) * + rbf_c2grad_coeff_view(8, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(8, jc, jb), jk, rbf_c2grad_blk_view(8, jc, jb)) + - rbf_c2grad_coeff_view(9, 2, jc, jb) * + rbf_c2grad_coeff_view(9, 1, jc, jb) * p_cell_in_view(rbf_c2grad_idx_view(9, jc, jb), jk, rbf_c2grad_blk_view(9, jc, jb)); }); @@ -270,10 +270,10 @@ void rbf_vec_interpol_cell_lib(const T *p_vn_in, const int *rbf_vec_idx_c, nblks_c); UnmanagedConstInt3D rbf_vec_blk_c_view(rbf_vec_blk_c, rbf_vec_dim_c, nproma, nblks_c); - UnmanagedConstT4D rbf_vec_coeff_c_view(rbf_vec_coeff_c, 
nproma, - nblks_c); // TODO + UnmanagedConstT4D rbf_vec_coeff_c_view(rbf_vec_coeff_c, rbf_vec_dim_c, 2, nproma, + nblks_c); UnmanagedT3D p_u_out_view(p_u_out, nproma, nlev, nblks_c); - UnmanagedT3D p_v_out_view(p_u_out, nproma, nlev, nblks_c); + UnmanagedT3D p_v_out_view(p_v_out, nproma, nlev, nblks_c); for (int jb = i_startblk; jb <= i_endblk; ++jb) { @@ -288,60 +288,60 @@ void rbf_vec_interpol_cell_lib(const T *p_vn_in, const int *rbf_vec_idx_c, "rbf_vec_interpol_cell_lib", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) { p_u_out_view(jc, jk, jb) = - rbf_vec_coeff_c_view(0, 1, jc, jb) * + rbf_vec_coeff_c_view(0, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(0, jc, jb), jk, rbf_vec_blk_c_view(0, jc, jb)) + - rbf_vec_coeff_c_view(1, 1, jc, jb) * + rbf_vec_coeff_c_view(1, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(1, jc, jb), jk, rbf_vec_blk_c_view(1, jc, jb)) + - rbf_vec_coeff_c_view(2, 1, jc, jb) * + rbf_vec_coeff_c_view(2, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(2, jc, jb), jk, rbf_vec_blk_c_view(2, jc, jb)) + - rbf_vec_coeff_c_view(3, 1, jc, jb) * + rbf_vec_coeff_c_view(3, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(3, jc, jb), jk, rbf_vec_blk_c_view(3, jc, jb)) + - rbf_vec_coeff_c_view(4, 1, jc, jb) * + rbf_vec_coeff_c_view(4, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(4, jc, jb), jk, rbf_vec_blk_c_view(4, jc, jb)) + - rbf_vec_coeff_c_view(5, 1, jc, jb) * + rbf_vec_coeff_c_view(5, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(5, jc, jb), jk, rbf_vec_blk_c_view(5, jc, jb)) + - rbf_vec_coeff_c_view(6, 1, jc, jb) * + rbf_vec_coeff_c_view(6, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(6, jc, jb), jk, rbf_vec_blk_c_view(6, jc, jb)) + - rbf_vec_coeff_c_view(7, 1, jc, jb) * + rbf_vec_coeff_c_view(7, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(7, jc, jb), jk, rbf_vec_blk_c_view(7, jc, jb)) + - rbf_vec_coeff_c_view(8, 1, jc, jb) * + rbf_vec_coeff_c_view(8, 0, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(8, jc, jb), jk, rbf_vec_blk_c_view(8, jc, jb)); 
p_v_out_view(jc, jk, jb) = - rbf_vec_coeff_c_view(0, 2, jc, jb) * + rbf_vec_coeff_c_view(0, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(0, jc, jb), jk, rbf_vec_blk_c_view(0, jc, jb)) + - rbf_vec_coeff_c_view(1, 2, jc, jb) * + rbf_vec_coeff_c_view(1, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(1, jc, jb), jk, rbf_vec_blk_c_view(1, jc, jb)) + - rbf_vec_coeff_c_view(2, 2, jc, jb) * + rbf_vec_coeff_c_view(2, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(2, jc, jb), jk, rbf_vec_blk_c_view(2, jc, jb)) + - rbf_vec_coeff_c_view(3, 2, jc, jb) * + rbf_vec_coeff_c_view(3, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(3, jc, jb), jk, rbf_vec_blk_c_view(3, jc, jb)) + - rbf_vec_coeff_c_view(4, 2, jc, jb) * + rbf_vec_coeff_c_view(4, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(4, jc, jb), jk, rbf_vec_blk_c_view(4, jc, jb)) + - rbf_vec_coeff_c_view(5, 2, jc, jb) * + rbf_vec_coeff_c_view(5, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(5, jc, jb), jk, rbf_vec_blk_c_view(5, jc, jb)) + - rbf_vec_coeff_c_view(6, 2, jc, jb) * + rbf_vec_coeff_c_view(6, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(6, jc, jb), jk, rbf_vec_blk_c_view(6, jc, jb)) + - rbf_vec_coeff_c_view(7, 2, jc, jb) * + rbf_vec_coeff_c_view(7, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(7, jc, jb), jk, rbf_vec_blk_c_view(7, jc, jb)) + - rbf_vec_coeff_c_view(8, 2, jc, jb) * + rbf_vec_coeff_c_view(8, 1, jc, jb) * p_vn_in_view(rbf_vec_idx_c_view(8, jc, jb), jk, rbf_vec_blk_c_view(8, jc, jb)); }); diff --git a/src/support/mo_lib_loopindices.cpp b/src/support/mo_lib_loopindices.cpp index 30c82bd2e98521f99b09abf9343ee1a5b52f6185..fcc31b6c2a6187f68c431c0f840b42f10fdd154b 100644 --- a/src/support/mo_lib_loopindices.cpp +++ b/src/support/mo_lib_loopindices.cpp @@ -12,21 +12,26 @@ #include <algorithm> // For std::max // get_indices_c_lib function -void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, +void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, const 
int i_blk, const int i_startblk, const int i_endblk, int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) { - + //Since code is ported incrementally from Fortran to C++, depending on where the function is called from //(either fortran or c++), the first index should be either 0 or 1. int first_index; - if (called_from_cpp) + int i_endidx_loc; + if (called_from_cpp){ first_index = 0; - else - first_index = 1; - + i_endidx_loc = nproma - 1; + } + else { + first_index = 1; + i_endidx_loc = nproma; + } + if (i_blk == i_startblk) { i_startidx_out = std::max(first_index, i_startidx_in); - i_endidx_out = nproma; + i_endidx_out = i_endidx_loc; if (i_blk == i_endblk) { i_endidx_out = i_endidx_in; } @@ -35,43 +40,53 @@ void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int i_endidx_out = i_endidx_in; } else { i_startidx_out = first_index; - i_endidx_out = nproma; + i_endidx_out = i_endidx_loc; } } // get_indices_e_lib function -void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, +void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, const int i_blk, const int i_startblk, const int i_endblk, int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) { - - //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, + + //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, //the first index should be either 0 or 1. int first_index; - if (called_from_cpp) + int i_endidx_loc; + if (called_from_cpp) { first_index = 0; - else + i_endidx_loc = nproma - 1; + } + else { first_index = 1; + i_endidx_loc = nproma; + } i_startidx_out = (i_blk != i_startblk) ? first_index : std::max(first_index, i_startidx_in); - i_endidx_out = (i_blk != i_endblk) ? nproma : i_endidx_in; + i_endidx_out = (i_blk != i_endblk) ? 
i_endidx_loc : i_endidx_in; } // get_indices_v_lib function -void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, +void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, const int i_blk, const int i_startblk, const int i_endblk, int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) { - - //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, + + //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, //the first index should be either 0 or 1. int first_index; - if (called_from_cpp) + int i_endidx_loc; + if (called_from_cpp) { first_index = 0; - else + i_endidx_loc = nproma - 1; + } + else { first_index = 1; + i_endidx_loc = nproma; + } if (i_blk == i_startblk) { i_startidx_out = i_startidx_in; - i_endidx_out = nproma; + i_endidx_out = i_endidx_loc; if (i_blk == i_endblk) { i_endidx_out = i_endidx_in; } @@ -80,6 +95,6 @@ void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int i_endidx_out = i_endidx_in; } else { i_startidx_out = first_index; - i_endidx_out = nproma; + i_endidx_out = i_endidx_loc; } -} \ No newline at end of file +} diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt index 90ab1e3d5700b8655779340d97f6cbd3650296c6..c0f7c59a86ae3afe9187dab588239e6983ae0d9b 100644 --- a/test/c/CMakeLists.txt +++ b/test/c/CMakeLists.txt @@ -26,10 +26,6 @@ message(CHECK_PASS "done") # Find Kokkos (or use your existing Kokkos installation) # find_package(Kokkos REQUIRED) -if(IM_ENABLE_LOOP_EXCHANGE) - target_compile_definitions(iconmath-interpolation PRIVATE __LOOP_EXCHANGE) -endif() - set(SOURCES main.cpp test_horizontal_div.cpp @@ -43,6 +39,10 @@ set(SOURCES # Create the test executable from your test files, including main.cpp. 
add_executable(iconmath_test_c ${SOURCES}) +if(IM_ENABLE_LOOP_EXCHANGE) + target_compile_definitions(iconmath_test_c PRIVATE __LOOP_EXCHANGE) +endif() + target_include_directories(iconmath_test_c PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) # Link the test executable with GoogleTest and Kokkos. diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp index 596d19e708d6c6e466fa2f23c579cb51f8689e19..ef95f6b72ab4f8ce8ca95dffd1e3f9f21c6f83b9 100644 --- a/test/c/test_horizontal_div.cpp +++ b/test/c/test_horizontal_div.cpp @@ -14,8 +14,8 @@ #include <vector> #include <Kokkos_Core.hpp> -#include <gtest/gtest.h> #include <dim_helper.hpp> +#include <gtest/gtest.h> #include <horizontal/mo_lib_divrot.hpp> #include <support/mo_lib_loopindices.hpp> @@ -29,50 +29,53 @@ protected: static constexpr int dim4d = 2; // 4th dimension size int i_startblk = 0; - int i_endblk = nblks_c; // Test blocks [0 .. nblks_c-1] + int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1] int i_startidx_in = 0; - int i_endidx_in = nproma; // Full range: 0 .. nproma-1 + int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1 std::vector<int> slev; std::vector<int> elev; bool lacc = false; // Not using ACC-specific behavior. - std::vector<ValueType> vec_e; - std::vector<int> cell_edge_idx; - std::vector<int> cell_edge_blk; - std::vector<ValueType> geofac_div; - std::vector<ValueType> div_vec_c; - std::vector<ValueType> f4din; - std::vector<ValueType> f4dout; + // Here we allocate Kokkos::View objects in a memory space that is directly + // accessible from both the host and device + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Views for the test data. 
All the data is assigned as one-dimensional arrays + Kokkos::View<ValueType *, memory_space> vec_e; + Kokkos::View<int *, memory_space> cell_edge_idx; + Kokkos::View<int *, memory_space> cell_edge_blk; + Kokkos::View<ValueType *, memory_space> geofac_div; + Kokkos::View<ValueType *, memory_space> div_vec_c; + Kokkos::View<ValueType *, memory_space> f4din; + Kokkos::View<ValueType *, memory_space> f4dout; // Followings are needed in HorizontalDivAvgTest - std::vector<int> cell_neighbor_idx; - std::vector<int> cell_neighbor_blk; - std::vector<ValueType> avg_coeff; - std::vector<ValueType> opt_in2; - std::vector<ValueType> opt_out2; - - HorizontalDivTest() { - slev.resize(dim4d, 0); - elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1) - - vec_e.resize(dim_combine(nproma, nlev, nblks_e)); - cell_edge_idx.resize(dim_combine(nproma, nblks_c, 3)); - cell_edge_blk.resize(dim_combine(nproma, nblks_c, 3)); - geofac_div.resize(dim_combine(nproma, 3, nblks_c)); - div_vec_c.resize(dim_combine(nproma, nlev, nblks_c)); - f4din.resize(dim_combine(nproma, nlev, nblks_e, dim4d)); - f4dout.resize(dim_combine(nproma, nlev, nblks_c, dim4d)); - cell_neighbor_idx.resize(dim_combine(nproma, nblks_c, 3)); - cell_neighbor_blk.resize(dim_combine(nproma, nblks_c, 3)); - avg_coeff.resize(dim_combine(nproma, 4, nblks_c)); - opt_in2.resize(dim_combine(nproma, nlev, nblks_e)); - opt_out2.resize(dim_combine(nproma, nlev, nblks_c)); - } + Kokkos::View<int *, memory_space> cell_neighbor_idx; + Kokkos::View<int *, memory_space> cell_neighbor_blk; + Kokkos::View<ValueType *, memory_space> avg_coeff; + Kokkos::View<ValueType *, memory_space> opt_in2; + Kokkos::View<ValueType *, memory_space> opt_out2; + + HorizontalDivTest() + : slev(dim4d, 0), + elev(dim4d, nlev - 1), // Full vertical range (0 .. 
nlev-1) + vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)), + cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, 3)), + cell_edge_blk("cell_edge_blk", dim_combine(nproma, nblks_c, 3)), + geofac_div("geofac_div", dim_combine(nproma, 3, nblks_c)), + div_vec_c("div_vec_c", dim_combine(nproma, nlev, nblks_c)), + f4din("f4din", dim_combine(nproma, nlev, nblks_e, dim4d)), + f4dout("f4dout", dim_combine(nproma, nlev, nblks_c, dim4d)), + cell_neighbor_idx("cell_neighbor_idx", dim_combine(nproma, nblks_c, 3)), + cell_neighbor_blk("cell_neighbor_blk", dim_combine(nproma, nblks_c, 3)), + avg_coeff("avg_coeff", dim_combine(nproma, 4, nblks_c)), + opt_in2("opt_in2", dim_combine(nproma, nlev, nblks_e)), + opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c)) {} }; /// ValueTypes which the divrot tests should run with typedef ::testing::Types<float, double> ValueTypes; - TYPED_TEST_SUITE(HorizontalDivTest, ValueTypes); TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) { @@ -86,34 +89,46 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) { const auto &geofac_div_at = at<nproma, 3, nblks_c>; const auto &div_vec_c_at = at<nproma, nlev, nblks_c>; - // Initialization with specific values + // create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + + // Initialize the arrays with the same patterns as before. 
for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern + vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); } - // Set edge indices to point to specific cells (including self) - this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i; - this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i; + cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; // All edges are in the same block for this test for (int j = 0; j < 3; ++j) { - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; } - // Geometric factors - this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5; - this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3; - this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2; + geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5); + geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3); + geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Initialize div_vec_c to zero for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0; + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); } } - - // Call the div3d function + // Copy the initialized data back to the device memory (or unified memory, + // which in some cases may be a no-op if already accessible on the host). + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + + // Call the div3d function using the device pointers from the Views. 
div3d<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), this->div_vec_c.data(), this->i_startblk, this->i_endblk, @@ -121,12 +136,20 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) { this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.4, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.1, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.2, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.2, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.4, 1e-6); + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 0, 0)), static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 1, 0)), static_cast<TypeParam>(3.4), + 1e-6); + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 0, 0)), static_cast<TypeParam>(2.1), + 1e-6); + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 1, 0)), static_cast<TypeParam>(4.2), + 1e-6); + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 0, 0)), static_cast<TypeParam>(2.2), + 1e-6); + EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 1, 0)), static_cast<TypeParam>(4.4), + 1e-6); } TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) { @@ -140,37 +163,44 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) { const auto &geofac_div_at = at<nproma, 3, nblks_c>; const auto &div_vec_c_at = at<nproma, nlev, nblks_c>; - // Set up random number generators + // create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + + // Initialize the arrays with 
random values. std::random_device rd; std::mt19937 gen(rd()); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); - // Initialization with random values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen); + vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen); } - // Set random edge indices for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen); - this->cell_edge_blk[cell_edge_at(i, 0, j)] = - 0; // Keep in same block for simplicity + cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen); + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; } - // Random geometric factors for (int j = 0; j < 3; ++j) { - this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen); + geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen); } - // Initialize div_vec_c to random values for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen); + div_vec_c_h[div_vec_c_at(i, k, 0)] = real_distrib(gen); } } - // Call the div3d function + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + div3d<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), this->div_vec_c.data(), this->i_startblk, this->i_endblk, @@ -178,38 +208,40 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) { this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + // Calculate reference values separately and verify results std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0); - for 
(int jb = this->i_startblk; jb < this->i_endblk; ++jb) { + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] * - this->geofac_div[geofac_div_at(jc, 2, jb)]; + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)], - ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5) + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)], + ref_div_vec_c[div_vec_c_at(i, k, 0)], tol) << "Results differ at i=" << i << ", k=" << k; } } @@ -229,36 +261,55 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) { const auto &f4d_at = at<nproma, nlev, nblks_e, dim4d>; const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>; + // Create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto f4din_h = Kokkos::create_mirror_view(this->f4din); + auto f4dout_h = Kokkos::create_mirror_view(this->f4dout); + // Initialization with specific values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern - this->f4din[f4d_at(i, k, 0, 0)] = - (i + 1) * (k + 2); // Different pattern for second field + vec_e_h[vec_e_at(i, k, 0)] = + static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern + f4din_h[f4d_at(i, k, 0, 0)] = static_cast<TypeParam>( + (i + 1) * (k + 2)); // Different pattern for second field } // Set edge indices to point to specific cells (including self) - this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i; - this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i; + cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; // 
All edges are in the same block for this test for (int j = 0; j < 3; ++j) { - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; } // Geometric factors - this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5; - this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3; - this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2; + geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5); + geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3); + geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Initialize div_vec_c and f4dout to zero for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0; - this->f4dout[f4dout_at(i, k, 0, 0)] = 0.0; + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); + f4dout_h[f4dout_at(i, k, 0, 0)] = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + Kokkos::deep_copy(this->f4din, f4din_h); + Kokkos::deep_copy(this->f4dout, f4dout_h); + // Call the div3d_2field function div3d_2field<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), @@ -268,21 +319,37 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) { this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(f4dout_h, this->f4dout); + // Check first field (same as in div3d test) - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.4, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.1, 1e-6); - 
EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.2, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.2, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.4, 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.4), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.1), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.2), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.2), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.4), + 1e-6); // Check second field (expected values calculated manually) - EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 0)], 3.4, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 0)], 5.1, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 0)], 4.2, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 0)], 6.3, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 0)], 4.4, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 0)], 6.6, 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(3.4), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(5.1), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(4.2), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(6.3), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(4.4), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(6.6), + 1e-6); } TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) { @@ -299,38 +366,56 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) { const auto &f4d_at = at<nproma, nlev, nblks_e, dim4d>; const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>; + // Create mirror views to store data on the host + auto vec_e_h = 
Kokkos::create_mirror_view(this->vec_e); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto f4din_h = Kokkos::create_mirror_view(this->f4din); + auto f4dout_h = Kokkos::create_mirror_view(this->f4dout); + // Set up random number generators std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); // Initialization with random values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen); - this->f4din[f4d_at(i, k, 0, 0)] = real_distrib(gen); + vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen); + f4din_h[f4d_at(i, k, 0, 0)] = real_distrib(gen); } // Set random edge indices for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen); - this->cell_edge_blk[cell_edge_at(i, 0, j)] = + cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen); + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity } // Random geometric factors for (int j = 0; j < 3; ++j) { - this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen); + geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen); } // Initialize div_vec_c and f4dout to random values for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen); - this->f4dout[f4dout_at(i, k, 0, 0)] = real_distrib(gen); + div_vec_c_h[div_vec_c_at(i, k, 0)] = real_distrib(gen); + f4dout_h[f4dout_at(i, k, 0, 0)] = real_distrib(gen); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_edge_idx, 
cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + Kokkos::deep_copy(this->f4din, f4din_h); + Kokkos::deep_copy(this->f4dout, f4dout_h); + // Call the div3d_2field function div3d_2field<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), @@ -340,55 +425,56 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) { this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(f4dout_h, this->f4dout); + // Calculate reference values separately and verify results std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0); std::vector<TypeParam> ref_f4dout(nproma * nlev * nblks_c * dim4d, 0.0); - for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { // Calculate reference value for first field ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] * - 
this->geofac_div[geofac_div_at(jc, 2, jb)]; + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; // Calculate reference value for second field ref_f4dout[f4dout_at(jc, jk, jb, 0)] = - this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)], - 0)] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)], - 0)] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 2)], - 0)] * - this->geofac_div[geofac_div_at(jc, 2, jb)]; + f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 0)], 0)] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)], 0)] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)], 0)] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results for first field for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)], - ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5) + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)], + ref_div_vec_c[div_vec_c_at(i, k, 0)], tol) << "First field results differ at i=" << i << ", k=" << k; } } @@ -396,8 +482,8 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) { // Verify results for second field for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->f4dout[f4dout_at(i, k, 0, 0)], - ref_f4dout[f4dout_at(i, k, 0, 0)], 1e-5) + EXPECT_NEAR(f4dout_h[f4dout_at(i, k, 0, 0)], + ref_f4dout[f4dout_at(i, k, 0, 0)], tol) << "Second field results differ at i=" << i << ", k=" << k; } } @@ -415,22 +501,37 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DSpecific) { const auto &f4din_at = at<nproma, nlev, nblks_e, dim4d>; const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>; + // Create mirror views to store data on the host + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto f4din_h = Kokkos::create_mirror_view(this->f4din); + auto f4dout_h = Kokkos::create_mirror_view(this->f4dout); + // Initialization for (int i = 0; i < nproma; ++i) { for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = (i + j) % nproma; - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; - this->geofac_div[geofac_div_at(i, j, 0)] = 0.1 * (j + 1); + cell_edge_idx_h[cell_edge_at(i, 0, j)] = (i + j) % nproma; + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; + geofac_div_h[geofac_div_at(i, j, 0)] = + static_cast<TypeParam>(0.1 * (j + 1)); } for (int k = 0; k < nlev; ++k) { for (int d = 0; d < dim4d; ++d) { - this->f4din[f4din_at(i, k, 0, d)] = 1.0 + i + k + d; - 
this->f4dout[f4dout_at(i, k, 0, d)] = 0.0; + f4din_h[f4din_at(i, k, 0, d)] = static_cast<TypeParam>(1.0 + i + k + d); + f4dout_h[f4dout_at(i, k, 0, d)] = static_cast<TypeParam>(0.0); } } } + // Copy initialized data to device + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->f4din, f4din_h); + Kokkos::deep_copy(this->f4dout, f4dout_h); + // Test function div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), this->f4din.data(), @@ -439,18 +540,33 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DSpecific) { this->slev.data(), this->elev.data(), this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); - EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 0)], 1.4, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 0)], 1.1, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 0)], 1.1, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 0)], 2.0, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 1)], 2.0, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 1)], 1.7, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 1)], 1.7, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 1)], 2.6, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 1)], 2.3, 1e-6); - EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 1)], 2.3, 1e-6); + // Copy results back to host for verification + Kokkos::deep_copy(f4dout_h, this->f4dout); + + EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(1.4), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(1.1), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(1.1), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(2.0), + 1e-6); + 
EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 1)], static_cast<TypeParam>(2.0), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 1)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 1)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 1)], static_cast<TypeParam>(2.6), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 1)], static_cast<TypeParam>(2.3), + 1e-6); + EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 1)], static_cast<TypeParam>(2.3), + 1e-6); } TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) { @@ -465,6 +581,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) { const auto &f4din_at = at<nproma, nlev, nblks_e, dim4d>; const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>; + // Create mirror views to store data on the host + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto f4din_h = Kokkos::create_mirror_view(this->f4din); + auto f4dout_h = Kokkos::create_mirror_view(this->f4dout); + std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); @@ -473,19 +596,26 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) { // Initialize with random values for (int i = 0; i < nproma; ++i) { for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen); - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; - this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen); + cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen); + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; + geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen); } for (int k = 0; k < nlev; ++k) { for (int d = 0; d < dim4d; ++d) { - 
this->f4din[f4din_at(i, k, 0, d)] = real_distrib(gen); - this->f4dout[f4dout_at(i, k, 0, d)] = 0.0; + f4din_h[f4din_at(i, k, 0, d)] = real_distrib(gen); + f4dout_h[f4dout_at(i, k, 0, d)] = static_cast<TypeParam>(0.0); } } } + // Copy initialized data to device + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->f4din, f4din_h); + Kokkos::deep_copy(this->f4dout, f4dout_h); + // Test function div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(), this->geofac_div.data(), this->f4din.data(), @@ -494,25 +624,30 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) { this->slev.data(), this->elev.data(), this->nproma, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + // Copy results back to host for verification + Kokkos::deep_copy(f4dout_h, this->f4dout); + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Compute reference result and check - for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); for (int ji = 0; ji < dim4d; ++ji) { - for (int jk = this->slev[ji]; jk < this->elev[ji]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[ji]; jk <= this->elev[ji]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { TypeParam expected = 0.0; for (int je = 0; je < 3; ++je) { - expected += - this->f4din[f4din_at( - this->cell_edge_idx[cell_edge_at(jc, jb, je)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, je)], ji)] * - this->geofac_div[geofac_div_at(jc, je, jb)]; + expected += f4din_h[f4din_at( + cell_edge_idx_h[cell_edge_at(jc, jb, je)], jk, + 
cell_edge_blk_h[cell_edge_at(jc, jb, je)], ji)] * + geofac_div_h[geofac_div_at(jc, je, jb)]; } - EXPECT_NEAR(this->f4dout[f4dout_at(jc, jk, jb, ji)], expected, 1e-5) + EXPECT_NEAR(f4dout_h[f4dout_at(jc, jk, jb, ji)], expected, tol) << "Random test fails at jc=" << jc << ", jk=" << jk << ", jb=" << jb << ", ji=" << ji; } @@ -521,77 +656,103 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) { } } -TYPED_TEST_SUITE(HorizontalDivTest, ValueTypes); - TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) { constexpr int nproma = this->nproma; constexpr int nlev = this->nlev; constexpr int nblks_c = this->nblks_c; constexpr int nblks_e = this->nblks_e; - constexpr int dim4d = this->dim4d; const auto &vec_e_at = at<nproma, nlev, nblks_e>; const auto &cell_edge_at = at<nproma, nblks_c, 3>; const auto &geofac_div_at = at<nproma, 3, nblks_c>; const auto &div_vec_c_at = at<nproma, nlev, nblks_c>; + const auto &cell_neighbor_at = at<nproma, nblks_c, 3>; + const auto &avg_coeff_at = at<nproma, 4, nblks_c>; // Vectors for additional parameters // Vectors for block and index ranges std::vector<int> i_startblk_in(3, 0); - std::vector<int> i_endblk_in(3, nblks_c); + std::vector<int> i_endblk_in(3, nblks_c - 1); std::vector<int> i_startidx_in(3, 0); - std::vector<int> i_endidx_in(3, nproma); + std::vector<int> i_endidx_in(3, nproma - 1); // Parameters for the test int patch_id = 1; bool l_limited_area = true; bool l2fields = true; - const auto &cell_neighbor_at = at<nproma, nblks_c, 3>; - const auto &avg_coeff_at = at<nproma, 4, nblks_c>; + // Create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = 
Kokkos::create_mirror_view(this->geofac_div); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2); + auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2); // Initialize the vectors with specific values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern - this->opt_in2[vec_e_at(i, k, 0)] = - (i + 1) * (k + 1) * 0.5; // Half of vec_e + vec_e_h[vec_e_at(i, k, 0)] = + static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern + opt_in2_h[vec_e_at(i, k, 0)] = + static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e } // Set edge indices to point to specific cells - this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i; - this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i; + cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; // Set neighbor indices similarly - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = i; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = i; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma; // All edges and neighbors are in the same block for this test for (int j = 0; j < 3; ++j) { - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } // Geometric factors - this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5; - 
this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3; - this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2; + geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5); + geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3); + geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Average coefficients - this->avg_coeff[avg_coeff_at(i, 0, 0)] = 0.4; // Self - this->avg_coeff[avg_coeff_at(i, 1, 0)] = 0.2; // First neighbor - this->avg_coeff[avg_coeff_at(i, 2, 0)] = 0.2; // Second neighbor - this->avg_coeff[avg_coeff_at(i, 3, 0)] = 0.2; // Third neighbor + avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self + avg_coeff_h[avg_coeff_at(i, 1, 0)] = + static_cast<TypeParam>(0.2); // First neighbor + avg_coeff_h[avg_coeff_at(i, 2, 0)] = + static_cast<TypeParam>(0.2); // Second neighbor + avg_coeff_h[avg_coeff_at(i, 3, 0)] = + static_cast<TypeParam>(0.2); // Third neighbor // Initialize div_vec_c and opt_out2 to zero for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0; - this->opt_out2[div_vec_c_at(i, k, 0)] = 0.0; + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); + opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + Kokkos::deep_copy(this->opt_in2, opt_in2_h); + Kokkos::deep_copy(this->opt_out2, opt_out2_h); + // Call the div_avg function div_avg<TypeParam>( this->vec_e.data(), this->cell_neighbor_idx.data(), @@ -603,19 +764,37 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) { 
this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev, this->nblks_c, this->nblks_e); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.88, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.76, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.04, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.08, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.08, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.16, 1e-6); - - EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 0, 0)], 0.94, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 1, 0)], 1.88, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 0, 0)], 1.02, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 1, 0)], 2.04, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 0, 0)], 1.04, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 1, 0)], 2.08, 1e-6); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(opt_out2_h, this->opt_out2); + + // Verify first field results + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), + 1e-6); + + // Verify second field results + EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.94), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(1.88), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(1.02), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(2.04), 
+ 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(1.04), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(2.08), + 1e-6); } TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { @@ -631,9 +810,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { // Vectors for block and index ranges std::vector<int> i_startblk_in(3, 0); - std::vector<int> i_endblk_in(3, nblks_c); + std::vector<int> i_endblk_in(3, nblks_c - 1); std::vector<int> i_startidx_in(3, 0); - std::vector<int> i_endidx_in(3, nproma); + std::vector<int> i_endidx_in(3, nproma - 1); // Parameters for the test int patch_id = 1; @@ -643,47 +822,73 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { const auto &cell_neighbor_at = at<nproma, nblks_c, 3>; const auto &avg_coeff_at = at<nproma, 4, nblks_c>; + // Create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2); + auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2); + // Set up random number generators std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); // Initialize with random values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = 
real_distrib(gen); - this->opt_in2[vec_e_at(i, k, 0)] = real_distrib(gen); + vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen); + opt_in2_h[vec_e_at(i, k, 0)] = real_distrib(gen); } // Set random edge indices for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen); - this->cell_edge_blk[cell_edge_at(i, 0, j)] = + cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen); + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity } // Random geometric factors for (int j = 0; j < 3; ++j) { - this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen); + geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen); } // Random average coefficients for (int j = 0; j < 4; ++j) { - this->avg_coeff[avg_coeff_at(i, j, 0)] = real_distrib(gen); + avg_coeff_h[avg_coeff_at(i, j, 0)] = real_distrib(gen); } // Random initial values for div_vec_c and opt_out2 for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen); - this->opt_out2[div_vec_c_at(i, k, 0)] = real_distrib(gen); + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); + opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + 
Kokkos::deep_copy(this->opt_in2, opt_in2_h); + Kokkos::deep_copy(this->opt_out2, opt_out2_h); + // Call the div_avg function div_avg<TypeParam>( this->vec_e.data(), this->cell_neighbor_idx.data(), @@ -695,6 +900,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(opt_out2_h, this->opt_out2); + // Calculate reference values manually std::vector<TypeParam> aux_c(dim_combine(nproma, nlev, nblks_c)); std::vector<TypeParam> aux_c2(dim_combine(nproma, nlev, nblks_c)); @@ -702,52 +911,46 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { std::vector<TypeParam> ref_opt_out2(dim_combine(nproma, nlev, nblks_c)); // Step 1: Calculate aux_c and aux_c2 - for (int jb = i_startblk_in[0]; jb < i_endblk_in[0]; ++jb) { + for (int jb = i_startblk_in[0]; jb <= i_endblk_in[0]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb, i_startblk_in[0], i_endblk_in[0], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { aux_c[div_vec_c_at(jc, jk, jb)] = - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] * - this->geofac_div[geofac_div_at(jc, 2, jb)]; + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + 
cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; aux_c2[div_vec_c_at(jc, jk, jb)] = - this->opt_in2[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->opt_in2[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->opt_in2[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] * - this->geofac_div[geofac_div_at(jc, 2, jb)]; + opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; } } } // Step 2: Assign aux_c to div_vec_c and aux_c2 to opt_out2 for patch_id > 0 - for (int jb = i_startblk_in[1]; jb < i_endblk_in[1]; ++jb) { + for (int jb = i_startblk_in[1]; jb <= i_endblk_in[1]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb, i_startblk_in[1], i_endblk_in[1], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; 
++jc) { ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = aux_c[div_vec_c_at(jc, jk, jb)]; ref_opt_out2[div_vec_c_at(jc, jk, jb)] = @@ -757,57 +960,60 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) { } // Step 3: Perform averaging for the rest of the blocks - for (int jb = i_startblk_in[2]; jb < i_endblk_in[2]; ++jb) { + for (int jb = i_startblk_in[2]; jb <= i_endblk_in[2]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb, i_startblk_in[2], i_endblk_in[2], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = aux_c[div_vec_c_at(jc, jk, jb)] * - this->avg_coeff[avg_coeff_at(jc, 0, jb)] + + avg_coeff_h[avg_coeff_at(jc, 0, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] * - this->avg_coeff[avg_coeff_at(jc, 1, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] * + avg_coeff_h[avg_coeff_at(jc, 1, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] * - this->avg_coeff[avg_coeff_at(jc, 2, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] * + avg_coeff_h[avg_coeff_at(jc, 2, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] * - this->avg_coeff[avg_coeff_at(jc, 3, jb)]; + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] * + avg_coeff_h[avg_coeff_at(jc, 3, jb)]; ref_opt_out2[div_vec_c_at(jc, jk, jb)] = aux_c2[div_vec_c_at(jc, jk, jb)] * - 
this->avg_coeff[avg_coeff_at(jc, 0, jb)] + + avg_coeff_h[avg_coeff_at(jc, 0, jb)] + aux_c2[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] * - this->avg_coeff[avg_coeff_at(jc, 1, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] * + avg_coeff_h[avg_coeff_at(jc, 1, jb)] + aux_c2[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] * - this->avg_coeff[avg_coeff_at(jc, 2, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] * + avg_coeff_h[avg_coeff_at(jc, 2, jb)] + aux_c2[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] * - this->avg_coeff[avg_coeff_at(jc, 3, jb)]; + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] * + avg_coeff_h[avg_coeff_at(jc, 3, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)], - ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5) + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)], + ref_div_vec_c[div_vec_c_at(i, k, 0)], tol) << "div_vec_c results differ at i=" << i << ", k=" << k; - EXPECT_NEAR(this->opt_out2[div_vec_c_at(i, k, 0)], - ref_opt_out2[div_vec_c_at(i, k, 0)], 1e-5) + EXPECT_NEAR(opt_out2_h[div_vec_c_at(i, k, 0)], + ref_opt_out2[div_vec_c_at(i, k, 0)], tol) << "opt_out2 results differ at i=" << i << ", k=" << k; } } @@ -818,7 +1024,6 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) { constexpr int nlev = this->nlev; constexpr int nblks_c = this->nblks_c; constexpr int nblks_e = this->nblks_e; - constexpr int dim4d = this->dim4d; const auto &vec_e_at = at<nproma, nlev, nblks_e>; const auto &cell_edge_at = at<nproma, nblks_c, 3>; @@ -827,9 +1032,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) { // Vectors for block and index ranges std::vector<int> i_startblk_in(3, 0); - std::vector<int> i_endblk_in(3, nblks_c); + std::vector<int> i_endblk_in(3, nblks_c - 1); std::vector<int> i_startidx_in(3, 0); - std::vector<int> i_endidx_in(3, nproma); + std::vector<int> i_endidx_in(3, nproma - 1); // Parameters for the test int patch_id = 1; @@ -839,48 +1044,78 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) { const auto &cell_neighbor_at = at<nproma, nblks_c, 3>; const auto &avg_coeff_at = at<nproma, 4, nblks_c>; + // Create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = 
Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2); + auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2); + // Initialize the vectors with specific values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern - this->opt_in2[vec_e_at(i, k, 0)] = - (i + 1) * (k + 1) * 0.5; // Half of vec_e + vec_e_h[vec_e_at(i, k, 0)] = + static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern + opt_in2_h[vec_e_at(i, k, 0)] = + static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e } // Set edge indices to point to specific cells - this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i; - this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i; + cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma; + cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma; // Set neighbor indices similarly - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = i; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = i; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma; // All edges and neighbors are in the same block for this test for (int j = 0; j < 3; ++j) { - this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } // Geometric factors - 
this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5; - this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3; - this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2; + geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5); + geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3); + geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Average coefficients - this->avg_coeff[avg_coeff_at(i, 0, 0)] = 0.4; // Self - this->avg_coeff[avg_coeff_at(i, 1, 0)] = 0.2; // First neighbor - this->avg_coeff[avg_coeff_at(i, 2, 0)] = 0.2; // Second neighbor - this->avg_coeff[avg_coeff_at(i, 3, 0)] = 0.2; // Third neighbor + avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self + avg_coeff_h[avg_coeff_at(i, 1, 0)] = + static_cast<TypeParam>(0.2); // First neighbor + avg_coeff_h[avg_coeff_at(i, 2, 0)] = + static_cast<TypeParam>(0.2); // Second neighbor + avg_coeff_h[avg_coeff_at(i, 3, 0)] = + static_cast<TypeParam>(0.2); // Third neighbor // Initialize div_vec_c and opt_out2 to zero for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0; - this->opt_out2[div_vec_c_at(i, k, 0)] = 0.0; + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); + opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + Kokkos::deep_copy(this->opt_in2, opt_in2_h); + Kokkos::deep_copy(this->opt_out2, opt_out2_h); + // Call the div_avg function div_avg<TypeParam>( this->vec_e.data(), this->cell_neighbor_idx.data(), @@ -892,19 +1127,37 @@ 
TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) { this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev, this->nblks_c, this->nblks_e); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.88, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.76, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.04, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.08, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.08, 1e-6); - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.16, 1e-6); - - EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 0, 0)], 0.0, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 1, 0)], 0.0, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 0, 0)], 0.0, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 1, 0)], 0.0, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 0, 0)], 0.0, 1e-6); - EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 1, 0)], 0.0, 1e-6); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(opt_out2_h, this->opt_out2); + + // Verify first field results + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), + 1e-6); + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), + 1e-6); + + // Since l2fields=false, opt_out2 should not be modified + EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.0), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(0.0), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(0.0), + 1e-6); 
+ EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(0.0), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(0.0), + 1e-6); + EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(0.0), + 1e-6); } TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) { @@ -920,9 +1173,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) { // Vectors for block and index ranges std::vector<int> i_startblk_in(3, 0); - std::vector<int> i_endblk_in(3, nblks_c); + std::vector<int> i_endblk_in(3, nblks_c - 1); std::vector<int> i_startidx_in(3, 0); - std::vector<int> i_endidx_in(3, nproma); + std::vector<int> i_endidx_in(3, nproma - 1); // Parameters for the test int patch_id = 1; @@ -932,49 +1185,75 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) { const auto &cell_neighbor_at = at<nproma, nblks_c, 3>; const auto &avg_coeff_at = at<nproma, 4, nblks_c>; + // Create mirror views to store data on the host + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c); + auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2); + auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2); + // Set up random number generators std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); // Initialize with random 
values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen); - this->opt_in2[vec_e_at(i, k, 0)] = + vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen); + opt_in2_h[vec_e_at(i, k, 0)] = real_distrib(gen); // Not used but initialize anyway } // Set random edge indices for (int j = 0; j < 3; ++j) { - this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen); - this->cell_edge_blk[cell_edge_at(i, 0, j)] = + cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen); + cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity } // Random geometric factors for (int j = 0; j < 3; ++j) { - this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen); + geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen); } // Random average coefficients for (int j = 0; j < 4; ++j) { - this->avg_coeff[avg_coeff_at(i, j, 0)] = real_distrib(gen); + avg_coeff_h[avg_coeff_at(i, j, 0)] = real_distrib(gen); } // Random initial values for div_vec_c and opt_out2 for (int k = 0; k < nlev; ++k) { - this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen); - this->opt_out2[div_vec_c_at(i, k, 0)] = - real_distrib(gen); // Not used but initialize anyway + div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); + opt_out2_h[div_vec_c_at(i, k, 0)] = + static_cast<TypeParam>(0.0); // Not used but initialize anyway } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + 
Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->geofac_div, geofac_div_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->div_vec_c, div_vec_c_h); + Kokkos::deep_copy(this->opt_in2, opt_in2_h); + Kokkos::deep_copy(this->opt_out2, opt_out2_h); + // Call the div_avg function with l2fields=false div_avg<TypeParam>( this->vec_e.data(), this->cell_neighbor_idx.data(), @@ -986,44 +1265,45 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) { this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev, this->nblks_c, this->nblks_e); + // Copy results back to host for verification + Kokkos::deep_copy(div_vec_c_h, this->div_vec_c); + Kokkos::deep_copy(opt_out2_h, this->opt_out2); + // Calculate reference values manually std::vector<TypeParam> aux_c(dim_combine(nproma, nlev, nblks_c)); std::vector<TypeParam> ref_div_vec_c(dim_combine(nproma, nlev, nblks_c)); // Step 1: Calculate aux_c (but not aux_c2 since l2fields=false) - for (int jb = i_startblk_in[0]; jb < i_endblk_in[0]; ++jb) { + for (int jb = i_startblk_in[0]; jb <= i_endblk_in[0]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb, i_startblk_in[0], i_endblk_in[0], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { aux_c[div_vec_c_at(jc, jk, jb)] = - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] * - this->geofac_div[geofac_div_at(jc, 0, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk, - this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] * - this->geofac_div[geofac_div_at(jc, 1, jb)] + - this->vec_e[vec_e_at( - this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk, - 
this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] * - this->geofac_div[geofac_div_at(jc, 2, jb)]; + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] * + geofac_div_h[geofac_div_at(jc, 0, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] * + geofac_div_h[geofac_div_at(jc, 1, jb)] + + vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk, + cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] * + geofac_div_h[geofac_div_at(jc, 2, jb)]; } } } // Step 2: Assign aux_c to div_vec_c for patch_id > 0 (opt_out2 not updated // since l2fields=false) - for (int jb = i_startblk_in[1]; jb < i_endblk_in[1]; ++jb) { + for (int jb = i_startblk_in[1]; jb <= i_endblk_in[1]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb, i_startblk_in[1], i_endblk_in[1], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = aux_c[div_vec_c_at(jc, jk, jb)]; } @@ -1032,38 +1312,41 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) { // Step 3: Perform averaging for the rest of the blocks (only for div_vec_c, // not opt_out2) - for (int jb = i_startblk_in[2]; jb < i_endblk_in[2]; ++jb) { + for (int jb = i_startblk_in[2]; jb <= i_endblk_in[2]; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb, i_startblk_in[2], i_endblk_in[2], i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { ref_div_vec_c[div_vec_c_at(jc, jk, jb)] = aux_c[div_vec_c_at(jc, jk, jb)] * - 
this->avg_coeff[avg_coeff_at(jc, 0, jb)] + + avg_coeff_h[avg_coeff_at(jc, 0, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] * - this->avg_coeff[avg_coeff_at(jc, 1, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] * + avg_coeff_h[avg_coeff_at(jc, 1, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] * - this->avg_coeff[avg_coeff_at(jc, 2, jb)] + + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] * + avg_coeff_h[avg_coeff_at(jc, 2, jb)] + aux_c[div_vec_c_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] * - this->avg_coeff[avg_coeff_at(jc, 3, jb)]; + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] * + avg_coeff_h[avg_coeff_at(jc, 3, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results - only check div_vec_c since l2fields=false means opt_out2 // isn't updated for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)], - ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5) + EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)], + ref_div_vec_c[div_vec_c_at(i, k, 0)], tol) << "div_vec_c results differ at i=" << i << ", k=" << k; } } diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp index 8938a101bd2a350da12a0def450472d1e0c9ec9f..b83886c3d5cea1726fb4618ef66407ef06bb6d3a 100644 --- a/test/c/test_horizontal_recon.cpp +++ b/test/c/test_horizontal_recon.cpp @@ -14,8 +14,8 @@ #include <vector> #include <Kokkos_Core.hpp> -#include <gtest/gtest.h> #include <dim_helper.hpp> +#include <gtest/gtest.h> #include <horizontal/mo_lib_divrot.hpp> #include <support/mo_lib_loopindices.hpp> @@ -26,8 +26,8 @@ enum class ReconstructionMethod { cubic, }; -/// Base test class for the horizontal reconstruct tests. Templated for the ValueType -/// and ReconMethod for the reconstruction method. +/// Base test class for the horizontal reconstruct tests. Templated for the +/// ValueType and ReconMethod for the reconstruction method. template <typename ValueType, int ReconMethod> class HorizontalReconTest : public ::testing::Test { protected: @@ -41,13 +41,15 @@ protected: return std::make_tuple(9, 5); case ReconstructionMethod::cubic: return std::make_tuple(9, 9); + default: + return std::make_tuple(0, 0); // or throw/assert if appropriate } } // Constant dimensions. 
static constexpr int nproma = 3; // inner loop length - static constexpr int nlev = 1; // number of vertical levels - static constexpr int nblks_c = 1; // number of cell blocks (for p_e_in) + static constexpr int nlev = 2; // number of vertical levels + static constexpr int nblks_c = 2; // number of cell blocks (for p_e_in) static constexpr std::tuple<int, int> lsq_dim = init_lsq_dim(static_cast<ReconstructionMethod>(ReconMethod)); static constexpr int lsq_dim_c = std::get<0>(lsq_dim); @@ -55,39 +57,51 @@ protected: // Parameter values. int i_startblk = 0; - int i_endblk = nblks_c; // Test blocks [0 .. nblks_c-1] + int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1] int i_startidx_in = 0; - int i_endidx_in = nproma; // Full range: 0 .. nproma-1 + int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1 int slev = 0; - int elev = nlev; // Full vertical range (0 .. nlev-1) + int elev = nlev - 1; // Full vertical range (0 .. nlev-1) int patch_id = 0; bool lacc = false; // Not using ACC-specific behavior. bool acc_async = false; // No asynchronous execution. bool l_consv = true; // With conservative correction. 
bool l_limited_area = true; // Limited area setup - std::vector<ValueType> p_cc; - std::vector<int> cell_neighbor_idx; - std::vector<int> cell_neighbor_blk; - std::vector<ValueType> lsq_qtmat_c; - std::vector<ValueType> lsq_rmat_rdiag_c; - std::vector<ValueType> lsq_rmat_utri_c; - std::vector<ValueType> lsq_moments; - std::vector<ValueType> lsq_pseudoinv; - std::vector<ValueType> p_coeff; - - HorizontalReconTest() { - p_cc.resize(dim_combine(nproma, nlev, nblks_c)); - cell_neighbor_idx.resize(dim_combine(nproma, nblks_c, lsq_dim_c)); - cell_neighbor_blk.resize(dim_combine(nproma, nblks_c, lsq_dim_c)); - lsq_qtmat_c.resize(dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)); - lsq_rmat_rdiag_c.resize(dim_combine(nproma, lsq_dim_unk, nblks_c)); - lsq_rmat_utri_c.resize(dim_combine( - nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c)); - lsq_moments.resize(dim_combine(nproma, nblks_c, lsq_dim_unk)); - lsq_pseudoinv.resize(dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)); - p_coeff.resize(dim_combine(lsq_dim_unk + 1, nproma, nlev, nblks_c)); - } + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Kokkos Views for test data + Kokkos::View<ValueType *, memory_space> p_cc; + Kokkos::View<int *, memory_space> cell_neighbor_idx; + Kokkos::View<int *, memory_space> cell_neighbor_blk; + Kokkos::View<ValueType *, memory_space> lsq_qtmat_c; + Kokkos::View<ValueType *, memory_space> lsq_rmat_rdiag_c; + Kokkos::View<ValueType *, memory_space> lsq_rmat_utri_c; + Kokkos::View<ValueType *, memory_space> lsq_moments; + Kokkos::View<ValueType *, memory_space> lsq_pseudoinv; + Kokkos::View<ValueType *, memory_space> p_coeff; + + HorizontalReconTest() + : p_cc("p_cc", dim_combine(nproma, nlev, nblks_c)), + cell_neighbor_idx("cell_neighbor_idx", + dim_combine(nproma, nblks_c, lsq_dim_c)), + cell_neighbor_blk("cell_neighbor_blk", + dim_combine(nproma, nblks_c, 
lsq_dim_c)), + lsq_qtmat_c("lsq_qtmat_c", + dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)), + lsq_rmat_rdiag_c("lsq_rmat_rdiag_c", + dim_combine(nproma, lsq_dim_unk, nblks_c)), + lsq_rmat_utri_c( + "lsq_rmat_utri_c", + dim_combine(nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, + nblks_c)), + lsq_moments("lsq_moments", dim_combine(nproma, nblks_c, lsq_dim_unk)), + lsq_pseudoinv("lsq_pseudoinv", + dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)), + p_coeff("p_coeff", + dim_combine(lsq_dim_unk + 1, nproma, nlev, nblks_c)) {} }; /// Test class for the horizontal tests. The reconstruction method is specified @@ -134,30 +148,54 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCell) { at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); + + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = i; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = i; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = i; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = i; for (int j = 0; j < 
lsq_dim_c; ++j) { - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; - this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0; - this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.5; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; + lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5); } + for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 0, 0)] = 2.0; - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 1, 0)] = 2.0; - this->lsq_rmat_utri_c[rmat_utri_at(i, 0, 0)] = 0.1; + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 0, 0)] = static_cast<TypeParam>(2.0); + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 1, 0)] = static_cast<TypeParam>(2.0); + lsq_rmat_utri_c_h[rmat_utri_at(i, 0, 0)] = static_cast<TypeParam>(0.1); - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_l<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -168,16 +206,19 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCell) { this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + 
Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], - 0.34, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(0.34), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 1.8, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(1.8), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - 1.0, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(1.0), 1e-6); } TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) { @@ -196,33 +237,56 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) { at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(0.0, 3.0); + std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0); // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j 
< lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; - this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = real_distrib(gen); - this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = real_distrib(gen); + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; + lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = real_distrib(gen); + lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = real_distrib(gen); } + for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 0, 0)] = real_distrib(gen); - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 1, 0)] = real_distrib(gen); - this->lsq_rmat_utri_c[rmat_utri_at(i, 0, 0)] = real_distrib(gen); + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 0, 0)] = real_distrib(gen); + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 1, 0)] = real_distrib(gen); + lsq_rmat_utri_c_h[rmat_utri_at(i, 0, 0)] = real_distrib(gen); - this->lsq_moments[moments_at(i, 0, 0)] = real_distrib(gen); - this->lsq_moments[moments_at(i, 0, 1)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, 0)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, 1)] = real_distrib(gen); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_l<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -233,50 +297,59 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) 
{ this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); std::vector<TypeParam> z_qt_times_d(lsq_dim_unk); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + + // doing the calculation only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + // calculation only for jk = 0 for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + p_cc_h[p_cc_at(jc, jk, jb)]; } z_qt_times_d[0] = 0.0; z_qt_times_d[1] = 0.0; for (int i = 0; i < lsq_dim_c; ++i) { - z_qt_times_d[0] += this->lsq_qtmat_c[qtmat_at(jc, 0, i, jb)] * z_d[i]; - z_qt_times_d[1] += this->lsq_qtmat_c[qtmat_at(jc, 1, i, jb)] * z_d[i]; + z_qt_times_d[0] += lsq_qtmat_c_h[qtmat_at(jc, 0, i, jb)] * z_d[i]; + z_qt_times_d[1] += lsq_qtmat_c_h[qtmat_at(jc, 1, i, jb)] * z_d[i]; } p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] = - this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, 1, jb)] * z_qt_times_d[1]; + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 1, jb)] * z_qt_times_d[1]; p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] = - this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, 0, jb)] * + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 0, jb)] * (z_qt_times_d[0] - - this->lsq_rmat_utri_c[rmat_utri_at(jc, 0, 
jb)] * + lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(2, jc)]); p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)] - + p_cc_h[p_cc_at(jc, jk, jb)] - p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] * - this->lsq_moments[moments_at(jc, jb, 0)] - + lsq_moments_h[moments_at(jc, jb, 0)] - p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] * - this->lsq_moments[moments_at(jc, jb, 1)]; + lsq_moments_h[moments_at(jc, jb, 1)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5) + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) << "For loop result fails for i = " << i << ", jc = " << jc; } } @@ -295,26 +368,46 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVD) { const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); + + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = i; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = i; - 
this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = i; - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = i; for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; - this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0; - this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.5; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; + lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5); } + for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_l_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -324,16 +417,19 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVD) { this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], - 0.65, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(0.65), 1e-6); EXPECT_NEAR( - 
this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 1.0, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(1.0), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - 0.5, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(0.5), 1e-6); } TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) { @@ -349,29 +445,48 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) { const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(0.0, 3.0); + std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0); // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; - this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = real_distrib(gen); - this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = real_distrib(gen); + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; + 
lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = real_distrib(gen); + lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = real_distrib(gen); } + for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_moments[moments_at(i, 0, 0)] = real_distrib(gen); - this->lsq_moments[moments_at(i, 0, 1)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, 0)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, 1)] = real_distrib(gen); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_l_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -381,45 +496,53 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) { this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + // doing the calculation only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + // calculation only for jk = 0 for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], 
jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + p_cc_h[p_cc_at(jc, jk, jb)]; } p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] = - this->lsq_pseudoinv[pseudoinv_at(jc, 1, 0, jb)] * z_d[0] + - this->lsq_pseudoinv[pseudoinv_at(jc, 1, 1, jb)] * z_d[1] + - this->lsq_pseudoinv[pseudoinv_at(jc, 1, 2, jb)] * z_d[2]; + lsq_pseudoinv_h[pseudoinv_at(jc, 1, 0, jb)] * z_d[0] + + lsq_pseudoinv_h[pseudoinv_at(jc, 1, 1, jb)] * z_d[1] + + lsq_pseudoinv_h[pseudoinv_at(jc, 1, 2, jb)] * z_d[2]; p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] = - this->lsq_pseudoinv[pseudoinv_at(jc, 0, 0, jb)] * z_d[0] + - this->lsq_pseudoinv[pseudoinv_at(jc, 0, 1, jb)] * z_d[1] + - this->lsq_pseudoinv[pseudoinv_at(jc, 0, 2, jb)] * z_d[2]; + lsq_pseudoinv_h[pseudoinv_at(jc, 0, 0, jb)] * z_d[0] + + lsq_pseudoinv_h[pseudoinv_at(jc, 0, 1, jb)] * z_d[1] + + lsq_pseudoinv_h[pseudoinv_at(jc, 0, 2, jb)] * z_d[2]; p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)] - + p_cc_h[p_cc_at(jc, jk, jb)] - p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] * - this->lsq_moments[moments_at(jc, jb, 0)] - + lsq_moments_h[moments_at(jc, jb, 0)] - p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] * - this->lsq_moments[moments_at(jc, jb, 1)]; + lsq_moments_h[moments_at(jc, jb, 1)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5) + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) << "For loop result fails for i = " << i << ", jc = " << jc; } } @@ -443,43 +566,65 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCell) { at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0; for (int j = 1; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_c; ++j) { - 
this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0; - this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.5; - this->lsq_qtmat_c[qtmat_at(i, 2, j, 0)] = 0.2; - this->lsq_qtmat_c[qtmat_at(i, 3, j, 0)] = 0.7; - this->lsq_qtmat_c[qtmat_at(i, 4, j, 0)] = 1.3; + lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5); + lsq_qtmat_c_h[qtmat_at(i, 2, j, 0)] = static_cast<TypeParam>(0.2); + lsq_qtmat_c_h[qtmat_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7); + lsq_qtmat_c_h[qtmat_at(i, 4, j, 0)] = static_cast<TypeParam>(1.3); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } for (int j = 0; j < lsq_dim_unk; ++j) { - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = 2.0; + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = static_cast<TypeParam>(2.0); } for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) { - this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = 1.0; + lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = static_cast<TypeParam>(1.0); } - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; - this->lsq_moments[moments_at(i, 0, 2)] = 0.4; - this->lsq_moments[moments_at(i, 0, 3)] = 0.5; - this->lsq_moments[moments_at(i, 0, 4)] = 0.6; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); + lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4); + lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5); + lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + 
Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_q<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -491,25 +636,28 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCell) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], - 0.24, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(0.24), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 3.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(3.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - -2.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(-2.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], - 2.8, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], + static_cast<TypeParam>(2.8), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], - -3.8, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], + static_cast<TypeParam>(-3.8), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], - 2.6, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], + static_cast<TypeParam>(2.6), 1e-6); } TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) { @@ -528,6 +676,19 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) { 
at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + + // NOTE: seeded from std::random_device, so the inputs differ on every run std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); @@ -535,29 +696,39 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) { // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_unk; ++j) { for (int k = 0; k < lsq_dim_c; ++k) { - this->lsq_qtmat_c[qtmat_at(i, j, k, 0)] = real_distrib(gen); + lsq_qtmat_c_h[qtmat_at(i, j, k, 0)] = real_distrib(gen); } - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = real_distrib(gen); - this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen); + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen); } for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) { - this->lsq_rmat_utri_c[rmat_utri_at(i,
j, 0)] = real_distrib(gen); + lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = real_distrib(gen); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_q<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -569,56 +740,91 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); std::vector<TypeParam> z_qt_times_d(lsq_dim_unk); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + + for (int i = 0; i < nproma; ++i) { + for (int j = 0; j < lsq_dim_unk + 1; ++j) { + p_result[(at<lsq_dim_unk + 1, nproma>(j, i))] = static_cast<TypeParam>(0.0); + } + } + + // doing the calculation only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + // Step 1: Calculate z_d values for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - 
this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + p_cc_h[p_cc_at(jc, jk, jb)]; } + + // Matrix multiplication (Q^T * d) for (int j = 0; j < lsq_dim_unk; ++j) { z_qt_times_d[j] = 0.0; for (int i = 0; i < lsq_dim_c; ++i) { z_qt_times_d[j] += - this->lsq_qtmat_c[qtmat_at(jc, j, i, jb)] * z_d[i]; - } - } - int utri_id = 0; - for (int j = lsq_dim_unk; j > 0; --j) { - p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = z_qt_times_d[j - 1]; - for (int k = j + 1; k <= lsq_dim_unk; ++k) { - p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] -= - this->lsq_rmat_utri_c[rmat_utri_at(jc, utri_id++, jb)] * - p_result[at<lsq_dim_unk + 1, nproma>(k, jc)]; + lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d[i]; } - p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] *= - this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, j - 1, jb)]; } + + // Back-substitution (mirrors the order in the GPU implementation) + p_result[at<lsq_dim_unk + 1, nproma>(5, jc)] = + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 4, jb)] * z_qt_times_d[4]; + + p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] = + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 3, jb)] * + (z_qt_times_d[3] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]); + p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] = + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 2, jb)] * + (z_qt_times_d[2] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 1, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 2, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]); + p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] = + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 1, jb)] * + (z_qt_times_d[1] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 3, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 4, 
jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 5, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]); + p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] = + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 0, jb)] * + (z_qt_times_d[0] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 6, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 7, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 8, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] - + lsq_rmat_utri_c_h[rmat_utri_at(jc, 9, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]); + // Conservation correction p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)]; + p_cc_h[p_cc_at(jc, jk, jb)]; for (int j = 0; j < lsq_dim_unk; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -= p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] * - this->lsq_moments[moments_at(jc, jb, j)]; + lsq_moments_h[moments_at(jc, jb, j)]; } } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5) + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) << "For loop result fails for i = " << i << ", jc = " << jc; } } @@ -637,35 +843,53 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVD) { const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0; for (int j = 1; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_c; ++j) { - this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0; - this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.5; - this->lsq_pseudoinv[pseudoinv_at(i, 2, j, 0)] = 0.2; - 
this->lsq_pseudoinv[pseudoinv_at(i, 3, j, 0)] = 0.7; - this->lsq_pseudoinv[pseudoinv_at(i, 4, j, 0)] = 1.3; + lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5); + lsq_pseudoinv_h[pseudoinv_at(i, 2, j, 0)] = static_cast<TypeParam>(0.2); + lsq_pseudoinv_h[pseudoinv_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7); + lsq_pseudoinv_h[pseudoinv_at(i, 4, j, 0)] = static_cast<TypeParam>(1.3); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; - this->lsq_moments[moments_at(i, 0, 2)] = 0.4; - this->lsq_moments[moments_at(i, 0, 3)] = 0.5; - this->lsq_moments[moments_at(i, 0, 4)] = 0.6; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); + lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4); + lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5); + lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_q_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -676,25 +900,28 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVD) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, 
nlev, nblks_c>(0, 0, 0, 0))], - -0.56, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(-0.56), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 1.0, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(1.0), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - 0.5, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(0.5), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], - 0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], + static_cast<TypeParam>(0.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], - 0.7, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], + static_cast<TypeParam>(0.7), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], - 1.3, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], + static_cast<TypeParam>(1.3), 1e-6); } TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) { @@ -708,37 +935,53 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) { const auto &cell_neighbor_at = at<nproma, nblks_c, lsq_dim_c>; const auto &pseudoinv_at = at<nproma, lsq_dim_unk, lsq_dim_c, nblks_c>; const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; - const auto &rmat_rdiag_at = at<nproma, lsq_dim_unk, nblks_c>; - const auto &rmat_utri_at = - at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + 
Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + + // NOTE: seeded from std::random_device, so the inputs differ on every run std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0); // Initialization is done only for iblk = 0 and ilev = 0 for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_unk; ++j) { for (int k = 0; k < lsq_dim_c; ++k) { - this->lsq_pseudoinv[pseudoinv_at(i, j, k, 0)] = real_distrib(gen); + lsq_pseudoinv_h[pseudoinv_at(i, j, k, 0)] = real_distrib(gen); } - this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_q_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -749,46 +992,56 @@
TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); - std::vector<TypeParam> z_qt_times_d(lsq_dim_unk); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + // calculating only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + // calculating only for jk = 0 for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + p_cc_h[p_cc_at(jc, jk, jb)]; } for (int j = 1; j < lsq_dim_unk + 1; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = 0.0; for (int i = 0; i < lsq_dim_c; ++i) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] += - this->lsq_pseudoinv[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i]; + lsq_pseudoinv_h[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i]; } + // p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] *= + // lsq_moments_h[moments_at(jc, jb, j - 1)]; } p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)]; - for (int j = 0; j < lsq_dim_unk; ++j) { + p_cc_h[p_cc_at(jc, jk, jb)]; + for (int j = 0; j < lsq_dim_unk; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -= + // coefficient j + 1 is weighted by moment j, so j must stay below lsq_dim_unk p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] * -
this->lsq_moments[moments_at(jc, jb, j)]; + lsq_moments_h[moments_at(jc, jb, j)]; } } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result - for (int j = 0; j < lsq_dim_unk + 1; ++j) { + for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(j, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], 1e-5) - << "For loop result fails for j = " << j << ", jc = " << jc; + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) + << "For loop result fails for i = " << i << ", jc = " << jc; } } } @@ -811,51 +1064,73 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCell) { at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0; for (int j = 1; j < lsq_dim_c; ++j) { - 
this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_c; ++j) { - this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0; - this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.9; - this->lsq_qtmat_c[qtmat_at(i, 2, j, 0)] = 0.8; - this->lsq_qtmat_c[qtmat_at(i, 3, j, 0)] = 0.7; - this->lsq_qtmat_c[qtmat_at(i, 4, j, 0)] = 0.6; - this->lsq_qtmat_c[qtmat_at(i, 5, j, 0)] = 0.5; - this->lsq_qtmat_c[qtmat_at(i, 6, j, 0)] = 0.4; - this->lsq_qtmat_c[qtmat_at(i, 7, j, 0)] = 0.3; - this->lsq_qtmat_c[qtmat_at(i, 8, j, 0)] = 0.2; + lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.9); + lsq_qtmat_c_h[qtmat_at(i, 2, j, 0)] = static_cast<TypeParam>(0.8); + lsq_qtmat_c_h[qtmat_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7); + lsq_qtmat_c_h[qtmat_at(i, 4, j, 0)] = static_cast<TypeParam>(0.6); + lsq_qtmat_c_h[qtmat_at(i, 5, j, 0)] = static_cast<TypeParam>(0.5); + lsq_qtmat_c_h[qtmat_at(i, 6, j, 0)] = static_cast<TypeParam>(0.4); + lsq_qtmat_c_h[qtmat_at(i, 7, j, 0)] = static_cast<TypeParam>(0.3); + lsq_qtmat_c_h[qtmat_at(i, 8, j, 0)] = static_cast<TypeParam>(0.2); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } for (int j = 0; j < lsq_dim_unk; ++j) { - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = 2.0; + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = static_cast<TypeParam>(2.0); } for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) { - this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = 1.0; + lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = static_cast<TypeParam>(1.0); } - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; - this->lsq_moments[moments_at(i, 0, 
2)] = 0.4; - this->lsq_moments[moments_at(i, 0, 3)] = 0.5; - this->lsq_moments[moments_at(i, 0, 4)] = 0.6; - this->lsq_moments[moments_at(i, 0, 5)] = 0.7; - this->lsq_moments[moments_at(i, 0, 6)] = 0.8; - this->lsq_moments[moments_at(i, 0, 7)] = 0.9; - this->lsq_moments[moments_at(i, 0, 8)] = 1.0; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); + lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4); + lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5); + lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6); + lsq_moments_h[moments_at(i, 0, 5)] = static_cast<TypeParam>(0.7); + lsq_moments_h[moments_at(i, 0, 6)] = static_cast<TypeParam>(0.8); + lsq_moments_h[moments_at(i, 0, 7)] = static_cast<TypeParam>(0.9); + lsq_moments_h[moments_at(i, 0, 8)] = static_cast<TypeParam>(1.0); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_c<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -867,37 +1142,40 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCell) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], - 0.28, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(0.28), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 
1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - -0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(-0.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], - -0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], + static_cast<TypeParam>(-0.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))], - -0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))], + static_cast<TypeParam>(-0.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))], - -0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))], + static_cast<TypeParam>(-0.2), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); } TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) { @@ -916,6 +1194,17 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) { 
at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c); + auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c); + auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); @@ -923,29 +1212,39 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) { // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_unk; ++j) { for (int k = 0; k < lsq_dim_c; ++k) { - this->lsq_qtmat_c[qtmat_at(i, j, k, 0)] = real_distrib(gen); + lsq_qtmat_c_h[qtmat_at(i, j, k, 0)] = real_distrib(gen); } - this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = real_distrib(gen); - this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen); + lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen); } for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) { - this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = real_distrib(gen); + lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = 
real_distrib(gen); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h); + Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h); + Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_c<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -957,27 +1256,31 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); std::vector<TypeParam> z_qt_times_d(lsq_dim_unk); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + // calculating only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + // calculating only for jk = 0 for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + 
p_cc_h[p_cc_at(jc, jk, jb)]; } for (int j = 0; j < lsq_dim_unk; ++j) { z_qt_times_d[j] = 0.0; for (int i = 0; i < lsq_dim_c; ++i) { - z_qt_times_d[j] += - this->lsq_qtmat_c[qtmat_at(jc, j, i, jb)] * z_d[i]; + z_qt_times_d[j] += lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d[i]; } } int utri_id = 0; @@ -985,28 +1288,31 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = z_qt_times_d[j - 1]; for (int k = j + 1; k <= lsq_dim_unk; ++k) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] -= - this->lsq_rmat_utri_c[rmat_utri_at(jc, utri_id++, jb)] * + lsq_rmat_utri_c_h[rmat_utri_at(jc, utri_id++, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(k, jc)]; } p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] *= - this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, j - 1, jb)]; + lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, j - 1, jb)]; } p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)]; + p_cc_h[p_cc_at(jc, jk, jb)]; for (int j = 0; j < lsq_dim_unk; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -= p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] * - this->lsq_moments[moments_at(jc, jb, j)]; + lsq_moments_h[moments_at(jc, jb, j)]; } } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5) + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) << "For loop result fails for i = " << i << ", jc = " << jc; } } @@ -1025,43 +1331,61 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVD) { const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + // Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = (i + 1); + p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1); - this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0; for (int j = 1; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i; - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i; + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_c; ++j) { - this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0; - this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.9; - this->lsq_pseudoinv[pseudoinv_at(i, 2, j, 0)] = 0.8; - 
this->lsq_pseudoinv[pseudoinv_at(i, 3, j, 0)] = 0.7; - this->lsq_pseudoinv[pseudoinv_at(i, 4, j, 0)] = 0.6; - this->lsq_pseudoinv[pseudoinv_at(i, 5, j, 0)] = 0.5; - this->lsq_pseudoinv[pseudoinv_at(i, 6, j, 0)] = 0.4; - this->lsq_pseudoinv[pseudoinv_at(i, 7, j, 0)] = 0.3; - this->lsq_pseudoinv[pseudoinv_at(i, 8, j, 0)] = 0.2; + lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0); + lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.9); + lsq_pseudoinv_h[pseudoinv_at(i, 2, j, 0)] = static_cast<TypeParam>(0.8); + lsq_pseudoinv_h[pseudoinv_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7); + lsq_pseudoinv_h[pseudoinv_at(i, 4, j, 0)] = static_cast<TypeParam>(0.6); + lsq_pseudoinv_h[pseudoinv_at(i, 5, j, 0)] = static_cast<TypeParam>(0.5); + lsq_pseudoinv_h[pseudoinv_at(i, 6, j, 0)] = static_cast<TypeParam>(0.4); + lsq_pseudoinv_h[pseudoinv_at(i, 7, j, 0)] = static_cast<TypeParam>(0.3); + lsq_pseudoinv_h[pseudoinv_at(i, 8, j, 0)] = static_cast<TypeParam>(0.2); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0; + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } - this->lsq_moments[moments_at(i, 0, 0)] = 0.2; - this->lsq_moments[moments_at(i, 0, 1)] = 0.3; - this->lsq_moments[moments_at(i, 0, 2)] = 0.4; - this->lsq_moments[moments_at(i, 0, 3)] = 0.5; - this->lsq_moments[moments_at(i, 0, 4)] = 0.6; - this->lsq_moments[moments_at(i, 0, 5)] = 0.7; - this->lsq_moments[moments_at(i, 0, 6)] = 0.8; - this->lsq_moments[moments_at(i, 0, 7)] = 0.9; - this->lsq_moments[moments_at(i, 0, 8)] = 1.0; + lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2); + lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3); + lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4); + lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5); + lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6); + lsq_moments_h[moments_at(i, 0, 5)] = 
static_cast<TypeParam>(0.7); + lsq_moments_h[moments_at(i, 0, 6)] = static_cast<TypeParam>(0.8); + lsq_moments_h[moments_at(i, 0, 7)] = static_cast<TypeParam>(0.9); + lsq_moments_h[moments_at(i, 0, 8)] = static_cast<TypeParam>(1.0); } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_c_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -1072,37 +1396,40 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVD) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Check result EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], - -1.64, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))], + static_cast<TypeParam>(-1.64), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], - 1.0, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))], + static_cast<TypeParam>(1.0), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], - 0.9, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))], + static_cast<TypeParam>(0.9), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], - 0.8, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))], + static_cast<TypeParam>(0.8), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], - 0.7, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))], + static_cast<TypeParam>(0.7), 
1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], - 0.6, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))], + static_cast<TypeParam>(0.6), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))], - 0.5, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))], + static_cast<TypeParam>(0.5), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))], - 0.4, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))], + static_cast<TypeParam>(0.4), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))], - 0.3, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))], + static_cast<TypeParam>(0.3), 1e-6); EXPECT_NEAR( - this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))], - 0.2, 1e-6); + p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))], + static_cast<TypeParam>(0.2), 1e-6); } TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) { @@ -1118,6 +1445,16 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) { const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>; const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>; + // Create host mirror views + auto p_cc_h = Kokkos::create_mirror_view(this->p_cc); + auto cell_neighbor_idx_h = + Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = + Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv); + auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments); + auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff); + std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); @@ -1125,25 +1462,33 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) { // 
Initialization for (int i = 0; i < nproma; ++i) { - this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen); + p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen); for (int j = 0; j < lsq_dim_c; ++j) { - this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen); - this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0; + cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen); + cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; } for (int j = 0; j < lsq_dim_unk; ++j) { for (int k = 0; k < lsq_dim_c; ++k) { - this->lsq_pseudoinv[pseudoinv_at(i, j, k, 0)] = real_distrib(gen); + lsq_pseudoinv_h[pseudoinv_at(i, j, k, 0)] = real_distrib(gen); } - this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen); + lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen); } for (int j = 0; j < lsq_dim_unk + 1; ++j) { - this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen); + p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0); } } + // Copy to device + Kokkos::deep_copy(this->p_cc, p_cc_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h); + Kokkos::deep_copy(this->lsq_moments, lsq_moments_h); + Kokkos::deep_copy(this->p_coeff, p_coeff_h); + // Test function recon_lsq_cell_c_svd<TypeParam>( this->p_cc.data(), this->cell_neighbor_idx.data(), @@ -1154,45 +1499,54 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) { this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c); + // Copy results back to host + Kokkos::deep_copy(p_coeff_h, this->p_coeff); + // Compute reference result std::vector<TypeParam> z_d(lsq_dim_c); std::vector<TypeParam> z_qt_times_d(lsq_dim_unk); std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma); + // calculating only for jb=0 for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_c_lib(this->i_startidx_in, 
this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); + // calculating only for jk = 0 for (int jk = this->slev; jk < this->elev; ++jk) { - for (int jc = i_startidx; jc < i_endidx; ++jc) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { for (int i = 0; i < lsq_dim_c; ++i) { - z_d[i] = this->p_cc[p_cc_at( - this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk, - this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] - - this->p_cc[p_cc_at(jc, jk, jb)]; + z_d[i] = p_cc_h[p_cc_at( + cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk, + cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] - + p_cc_h[p_cc_at(jc, jk, jb)]; } for (int j = 1; j < lsq_dim_unk + 1; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = 0.0; for (int i = 0; i < lsq_dim_c; ++i) { p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] += - this->lsq_pseudoinv[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i]; + lsq_pseudoinv_h[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i]; } } p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] = - this->p_cc[p_cc_at(jc, jk, jb)]; + p_cc_h[p_cc_at(jc, jk, jb)]; for (int j = 0; j < lsq_dim_unk; ++j) { p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -= p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] * - this->lsq_moments[moments_at(jc, jb, j)]; + lsq_moments_h[moments_at(jc, jb, j)]; } } } } + + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Check result for (int i = 0; i < lsq_dim_unk + 1; ++i) { for (int jc = 0; jc < nproma; ++jc) { - EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))], - p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5) + EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))], + p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol) << "For loop result fails for i = " << i << ", jc = " << jc; } } diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp index 68e80245f2fa4ea19f173f1f8ac095fda5505775..ca675a494302ccac8df0ba5f01417cc36808153c 100644 --- a/test/c/test_horizontal_rot.cpp +++ b/test/c/test_horizontal_rot.cpp @@ -14,8 +14,8 @@ #include <vector> #include <Kokkos_Core.hpp> -#include <gtest/gtest.h> #include <dim_helper.hpp> +#include <gtest/gtest.h> #include <horizontal/mo_lib_divrot.hpp> #include <support/mo_lib_loopindices.hpp> @@ -30,34 +30,36 @@ protected: static constexpr int dim4d = 2; // 4th dimension size int i_startblk = 0; - int i_endblk = nblks_v; // Test blocks [0 .. nblks_v-1] + int i_endblk = nblks_v - 1; // Test blocks [0 .. nblks_v-1] int i_startidx_in = 0; - int i_endidx_in = nproma; // Full range: 0 .. nproma-1 + int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1 std::vector<int> slev; std::vector<int> elev; bool lacc = false; // Not using ACC-specific behavior. bool acc_async = false; // Not using ACC-specific behavior. - std::vector<ValueType> vec_e; - std::vector<int> vert_edge_idx; - std::vector<int> vert_edge_blk; - std::vector<ValueType> geofac_rot; - std::vector<ValueType> rot_vec; - std::vector<ValueType> f4din; - std::vector<ValueType> f4dout; - - HorizontalRotVertexTest() { - slev.resize(dim4d, 0); - elev.resize(dim4d, nlev); // Full vertical range (0 .. 
nlev-1) - - vec_e.resize(dim_combine(nproma, nlev, nblks_e)); - vert_edge_idx.resize(dim_combine(nproma, nblks_v, 6)); - vert_edge_blk.resize(dim_combine(nproma, nblks_v, 6)); - geofac_rot.resize(dim_combine(nproma, 6, nblks_v)); - rot_vec.resize(dim_combine(nproma, nlev, nblks_v)); - f4din.resize(dim_combine(nproma, nlev, nblks_e, dim4d)); - f4dout.resize(dim_combine(nproma, nlev, nblks_v, dim4d)); - } + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Views for the test data. All the data is assigned as one-dimensional arrays + Kokkos::View<ValueType *, memory_space> vec_e; + Kokkos::View<int *, memory_space> vert_edge_idx; + Kokkos::View<int *, memory_space> vert_edge_blk; + Kokkos::View<ValueType *, memory_space> geofac_rot; + Kokkos::View<ValueType *, memory_space> rot_vec; + Kokkos::View<ValueType *, memory_space> f4din; + Kokkos::View<ValueType *, memory_space> f4dout; + + HorizontalRotVertexTest() + : slev(dim4d, 0), + elev(dim4d, nlev - 1), // Full vertical range (0 .. 
nlev-1) + vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)), + vert_edge_idx("vert_edge_idx", dim_combine(nproma, nblks_v, 6)), + vert_edge_blk("vert_edge_blk", dim_combine(nproma, nblks_v, 6)), + geofac_rot("geofac_rot", dim_combine(nproma, 6, nblks_v)), + rot_vec("rot_vec", dim_combine(nproma, nlev, nblks_v)), + f4din("f4din", dim_combine(nproma, nlev, nblks_e, dim4d)), + f4dout("f4dout", dim_combine(nproma, nlev, nblks_v, dim4d)) {} }; /// ValueTypes which the divrot tests should run with @@ -76,33 +78,46 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosSpecific) { const auto &geofac_rot_at = at<nproma, 6, nblks_v>; const auto &rot_vec_at = at<nproma, nlev, nblks_v>; + // Create host mirror views + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx); + auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk); + auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot); + auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec); + // Initialization with specific values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern + vec_e_h(vec_e_at(i, k, 0)) = (i + 1) * (k + 1); // Simple pattern } // Set edge indices to point to specific edges for (int j = 0; j < 6; ++j) { - this->vert_edge_idx[vert_edge_at(i, 0, j)] = (i + j) % nproma; + vert_edge_idx_h(vert_edge_at(i, 0, j)) = (i + j) % nproma; // All edges are in the same block for this test - this->vert_edge_blk[vert_edge_at(i, 0, j)] = 0; + vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0; } - // Geometric factors for rotation - this->geofac_rot[geofac_rot_at(i, 0, 0)] = 0.3; - this->geofac_rot[geofac_rot_at(i, 1, 0)] = 0.2; - this->geofac_rot[geofac_rot_at(i, 2, 0)] = 0.1; - this->geofac_rot[geofac_rot_at(i, 3, 0)] = 0.2; - this->geofac_rot[geofac_rot_at(i, 4, 0)] = 0.1; - this->geofac_rot[geofac_rot_at(i, 5, 0)] = 0.1; + 
geofac_rot_h(geofac_rot_at(i, 0, 0)) = 0.3; + geofac_rot_h(geofac_rot_at(i, 1, 0)) = 0.2; + geofac_rot_h(geofac_rot_at(i, 2, 0)) = 0.1; + geofac_rot_h(geofac_rot_at(i, 3, 0)) = 0.2; + geofac_rot_h(geofac_rot_at(i, 4, 0)) = 0.1; + geofac_rot_h(geofac_rot_at(i, 5, 0)) = 0.1; // Initialize rot_vec to zero for (int k = 0; k < nlev; ++k) { - this->rot_vec[rot_vec_at(i, k, 0)] = 0.0; + rot_vec_h(rot_vec_at(i, k, 0)) = 0.0; } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h); + Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h); + Kokkos::deep_copy(this->geofac_rot, geofac_rot_h); + Kokkos::deep_copy(this->rot_vec, rot_vec_h); + // Call the rot_vertex_atmos function rot_vertex_atmos<TypeParam>( this->vec_e.data(), this->vert_edge_idx.data(), @@ -111,13 +126,22 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosSpecific) { this->slev[0], this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_e, this->nblks_v); + // Copy results back to host for verification + Kokkos::deep_copy(rot_vec_h, this->rot_vec); + // Expected values based on the initialization pattern - EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 1, 0)], 3.4, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 0, 0)], 2.1, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 1, 0)], 4.2, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 0, 0)], 2.2, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 1, 0)], 4.4, 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 0, 0)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 1, 0)], static_cast<TypeParam>(3.4), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 0, 0)], static_cast<TypeParam>(2.1), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 1, 0)], static_cast<TypeParam>(4.2), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 0, 0)], static_cast<TypeParam>(2.2), + 1e-6); + 
EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 1, 0)], static_cast<TypeParam>(4.4), + 1e-6); } TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) { @@ -131,36 +155,50 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) { const auto &geofac_rot_at = at<nproma, 6, nblks_v>; const auto &rot_vec_at = at<nproma, nlev, nblks_v>; + // Create host mirror views + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx); + auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk); + auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot); + auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec); + // Set up random number generators std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); // Initialization with random values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen); + vec_e_h(vec_e_at(i, k, 0)) = real_distrib(gen); } // Set random edge indices for (int j = 0; j < 6; ++j) { - this->vert_edge_idx[vert_edge_at(i, 0, j)] = int_distrib(gen); - this->vert_edge_blk[vert_edge_at(i, 0, j)] = + vert_edge_idx_h(vert_edge_at(i, 0, j)) = int_distrib(gen); + vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0; // Keep in same block for simplicity } // Random geometric factors for (int j = 0; j < 6; ++j) { - this->geofac_rot[geofac_rot_at(i, j, 0)] = real_distrib(gen); + geofac_rot_h(geofac_rot_at(i, j, 0)) = real_distrib(gen); } - // Initialize rot_vec to random values + // Initialize rot_vec to zero for (int k = 0; k < nlev; ++k) { - this->rot_vec[rot_vec_at(i, k, 0)] = real_distrib(gen); + rot_vec_h(rot_vec_at(i, k, 0)) = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + 
Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h); + Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h); + Kokkos::deep_copy(this->geofac_rot, geofac_rot_h); + Kokkos::deep_copy(this->rot_vec, rot_vec_h); + // Call the rot_vertex_atmos function rot_vertex_atmos<TypeParam>( this->vec_e.data(), this->vert_edge_idx.data(), @@ -169,50 +207,50 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) { this->slev[0], this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_e, this->nblks_v); + // Copy results back to host for verification + Kokkos::deep_copy(rot_vec_h, this->rot_vec); + // Calculate reference values separately and verify results std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0); - for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jv = i_startidx; jv < i_endidx; ++jv) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { ref_rot_vec[rot_vec_at(jv, jk, jb)] = - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 0)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 0)])] * - this->geofac_rot[geofac_rot_at(jv, 0, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 1)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 1)])] * - this->geofac_rot[geofac_rot_at(jv, 1, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 2)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 2)])] * - this->geofac_rot[geofac_rot_at(jv, 2, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 3)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 3)])] * - 
this->geofac_rot[geofac_rot_at(jv, 3, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 4)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 4)])] * - this->geofac_rot[geofac_rot_at(jv, 4, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 5)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 5)])] * - this->geofac_rot[geofac_rot_at(jv, 5, jb)]; + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] * + geofac_rot_h[geofac_rot_at(jv, 0, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 1)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 1)])] * + geofac_rot_h[geofac_rot_at(jv, 1, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 2)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 2)])] * + geofac_rot_h[geofac_rot_at(jv, 2, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 3)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 3)])] * + geofac_rot_h[geofac_rot_at(jv, 3, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 4)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 4)])] * + geofac_rot_h[geofac_rot_at(jv, 4, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 5)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 5)])] * + geofac_rot_h[geofac_rot_at(jv, 5, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->rot_vec[rot_vec_at(i, k, 0)], - ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5) + EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)], + ref_rot_vec[rot_vec_at(i, k, 0)], tol) << "Results differ at i=" << i << ", k=" << k; } } @@ -231,33 +269,47 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRISpecific) { const auto &geofac_rot_at = at<nproma, 6, nblks_v>; const auto &rot_vec_at = at<nproma, nlev, nblks_v>; + // Create host mirror views + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx); + auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk); + auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot); + auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec); + // Initialization with specific values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern + vec_e_h(vec_e_at(i, k, 0)) = (i + 1) * (k + 1); // Simple pattern } // Set edge indices to point to specific edges for (int j = 0; j < 6; ++j) { - this->vert_edge_idx[vert_edge_at(i, 0, j)] = (i + j) % nproma; + vert_edge_idx_h(vert_edge_at(i, 0, j)) = (i + j) % nproma; // All edges are in the same block for this test - this->vert_edge_blk[vert_edge_at(i, 0, j)] = 0; + vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0; } // Geometric factors for rotation - this->geofac_rot[geofac_rot_at(i, 0, 0)] = 0.3; - this->geofac_rot[geofac_rot_at(i, 1, 0)] = 0.2; - this->geofac_rot[geofac_rot_at(i, 2, 0)] = 0.1; - this->geofac_rot[geofac_rot_at(i, 3, 0)] = 0.2; - this->geofac_rot[geofac_rot_at(i, 4, 0)] = 0.1; - this->geofac_rot[geofac_rot_at(i, 5, 0)] = 0.1; + geofac_rot_h(geofac_rot_at(i, 0, 0)) = 0.3; + geofac_rot_h(geofac_rot_at(i, 1, 0)) = 0.2; + geofac_rot_h(geofac_rot_at(i, 2, 
0)) = 0.1; + geofac_rot_h(geofac_rot_at(i, 3, 0)) = 0.2; + geofac_rot_h(geofac_rot_at(i, 4, 0)) = 0.1; + geofac_rot_h(geofac_rot_at(i, 5, 0)) = 0.1; // Initialize rot_vec to zero for (int k = 0; k < nlev; ++k) { - this->rot_vec[rot_vec_at(i, k, 0)] = 0.0; + rot_vec_h(rot_vec_at(i, k, 0)) = 0.0; } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h); + Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h); + Kokkos::deep_copy(this->geofac_rot, geofac_rot_h); + Kokkos::deep_copy(this->rot_vec, rot_vec_h); + // Call the rot_vertex_ri function rot_vertex_ri<TypeParam>( this->vec_e.data(), this->vert_edge_idx.data(), @@ -266,13 +318,22 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRISpecific) { this->slev[0], this->elev[0], this->nproma, this->lacc, this->acc_async, this->nlev, this->nblks_e, this->nblks_v); + // Copy results back to host for verification + Kokkos::deep_copy(rot_vec_h, this->rot_vec); + // Expected values based on the initialization pattern - EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 0, 0)], 1.7, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 1, 0)], 3.4, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 0, 0)], 2.1, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 1, 0)], 4.2, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 0, 0)], 2.2, 1e-6); - EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 1, 0)], 4.4, 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 0, 0)], static_cast<TypeParam>(1.7), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 1, 0)], static_cast<TypeParam>(3.4), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 0, 0)], static_cast<TypeParam>(2.1), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 1, 0)], static_cast<TypeParam>(4.2), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 0, 0)], static_cast<TypeParam>(2.2), + 1e-6); + EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 1, 0)], static_cast<TypeParam>(4.4), + 1e-6); } TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) 
{ @@ -286,36 +347,50 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) { const auto &geofac_rot_at = at<nproma, 6, nblks_v>; const auto &rot_vec_at = at<nproma, nlev, nblks_v>; + // Create host mirror views + auto vec_e_h = Kokkos::create_mirror_view(this->vec_e); + auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx); + auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk); + auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot); + auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec); + // Set up random number generators std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<int> int_distrib(0, nproma - 1); - std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0); + std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0); // Initialization with random values for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen); + vec_e_h(vec_e_at(i, k, 0)) = real_distrib(gen); } // Set random edge indices for (int j = 0; j < 6; ++j) { - this->vert_edge_idx[vert_edge_at(i, 0, j)] = int_distrib(gen); - this->vert_edge_blk[vert_edge_at(i, 0, j)] = + vert_edge_idx_h(vert_edge_at(i, 0, j)) = int_distrib(gen); + vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0; // Keep in same block for simplicity } // Random geometric factors for (int j = 0; j < 6; ++j) { - this->geofac_rot[geofac_rot_at(i, j, 0)] = real_distrib(gen); + geofac_rot_h(geofac_rot_at(i, j, 0)) = real_distrib(gen); } - // Initialize rot_vec to random values + // Initialize rot_vec to zero for (int k = 0; k < nlev; ++k) { - this->rot_vec[rot_vec_at(i, k, 0)] = real_distrib(gen); + rot_vec_h(rot_vec_at(i, k, 0)) = static_cast<TypeParam>(0.0); } } + // Copy initialized data to device + Kokkos::deep_copy(this->vec_e, vec_e_h); + Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h); + Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h); + 
Kokkos::deep_copy(this->geofac_rot, geofac_rot_h); + Kokkos::deep_copy(this->rot_vec, rot_vec_h); + // Call the rot_vertex_ri function rot_vertex_ri<TypeParam>( this->vec_e.data(), this->vert_edge_idx.data(), @@ -324,55 +399,51 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) { this->slev[0], this->elev[0], this->nproma, this->lacc, this->acc_async, this->nlev, this->nblks_e, this->nblks_v); - // Ensure computation is complete for both modes - Kokkos::fence(); + // Copy results back to host for verification + Kokkos::deep_copy(rot_vec_h, this->rot_vec); // Calculate reference values separately and verify results std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0); - for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { int i_startidx, i_endidx; get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, this->i_startblk, this->i_endblk, i_startidx, i_endidx); - for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) { - for (int jv = i_startidx; jv < i_endidx; ++jv) { + for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { ref_rot_vec[rot_vec_at(jv, jk, jb)] = - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 0)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 0)])] * - this->geofac_rot[geofac_rot_at(jv, 0, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 1)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 1)])] * - this->geofac_rot[geofac_rot_at(jv, 1, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 2)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 2)])] * - this->geofac_rot[geofac_rot_at(jv, 2, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 3)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 3)])] * - this->geofac_rot[geofac_rot_at(jv, 3, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, 
jb, 4)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 4)])] * - this->geofac_rot[geofac_rot_at(jv, 4, jb)] + - this->vec_e[vec_e_at( - this->vert_edge_idx[vert_edge_at(jv, jb, 5)], jk, - this->vert_edge_blk[vert_edge_at(jv, jb, 5)])] * - this->geofac_rot[geofac_rot_at(jv, 5, jb)]; + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] * + geofac_rot_h[geofac_rot_at(jv, 0, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 1)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 1)])] * + geofac_rot_h[geofac_rot_at(jv, 1, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 2)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 2)])] * + geofac_rot_h[geofac_rot_at(jv, 2, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 3)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 3)])] * + geofac_rot_h[geofac_rot_at(jv, 3, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 4)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 4)])] * + geofac_rot_h[geofac_rot_at(jv, 4, jb)] + + vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 5)], jk, + vert_edge_blk_h[vert_edge_at(jv, jb, 5)])] * + geofac_rot_h[geofac_rot_at(jv, 5, jb)]; } } } + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + // Verify results for (int i = 0; i < nproma; ++i) { for (int k = 0; k < nlev; ++k) { - EXPECT_NEAR(this->rot_vec[rot_vec_at(i, k, 0)], - ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5) + EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)], + ref_rot_vec[rot_vec_at(i, k, 0)], tol) << "Results differ at i=" << i << ", k=" << k << ")"; } } } - diff --git a/test/c/test_interpolation_scalar.cpp b/test/c/test_interpolation_scalar.cpp index 507ec3f01ae5e79b6053cd52fc0d1af41cc67f41..20ccf4f205cfc8f9945aa2ae69a8daf10cf5ea76 100644 --- a/test/c/test_interpolation_scalar.cpp +++ b/test/c/test_interpolation_scalar.cpp @@ -10,18 +10,12 @@ // --------------------------------------------------------------- #include "mo_lib_interpolation_scalar.hpp" +#include "mo_lib_loopindices.hpp" #include <Kokkos_Core.hpp> #include <gtest/gtest.h> #include <vector> - -// Free-function helpers for 3D and 4D array sizes (assumed column-major) -template <typename T> size_t num_elements_3d(int d1, int d2, int d3) { - return static_cast<size_t>(d1) * d2 * d3; -} - -template <typename T> size_t num_elements_4d(int d1, int d2, int d3, int d4) { - return static_cast<size_t>(d1) * d2 * d3 * d4; -} +#include <random> +#include "dim_helper.hpp" // Define a helper struct that holds the two types. template <typename InT, typename OutT> struct MixedPrecision { @@ -44,116 +38,99 @@ typedef ::testing::Types<MixedPrecision<double, double>, class interp_dimensions { public: // Constant dimensions. 
- static constexpr int nproma = 16; // inner loop length - static constexpr int nlev = 7; // number of vertical levels + static constexpr int nproma = 2; // inner loop length + static constexpr int nlev = 3; // number of vertical levels static constexpr int nblks_c = 2; // number of cell blocks static constexpr int nblks_e = 2; // number of edge blocks (for p_e_in) static constexpr int nblks_v = 2; // number of vertex blocks + static constexpr int cell_type = 6; + static constexpr int npromz_c = 2; + // Parameter values. const int i_startblk = 0; - const int i_endblk = 1; // Test blocks [0, 1] - const int i_startidx = 2; - const int i_endidx = nproma - 3; // Partial range: 2 .. nproma-3 - const int slev = 1; + const int i_endblk = nblks_c - 1; // Test blocks [0, 1] + const int i_startidx = 0; + const int i_endidx = nproma - 1; // Partial range: 2 .. nproma-3 + const int slev = 0; const int elev = nlev - 1; // Partial vertical range (1 .. nlev-1) const bool lacc = false; // Not using ACC-specific behavior. const bool acc_async = false; // No asynchronous execution. 
}; -template <typename T> -class InterpolationScalarTypedTestFixture : public ::testing::Test, +template <typename ValueType> +class InterpolationScalarSingleParamTest : public ::testing::Test, public interp_dimensions { -public: - // Arrays used for verts2edges - std::vector<T> p_vertex_in; // Dimensions: (nproma, nlev, nblks_v) - std::vector<int> edge_vertex_idx; // Dimensions: (nproma, nblks_e, 4) - std::vector<int> edge_vertex_blk; // Dimensions: (nproma, nblks_e, 4) - std::vector<T> coeff_int_edges; // Dimensions: (nproma, 2, nblks_e) - std::vector<T> p_edge_out; // Dimensions: (nproma, nlev, nblks_e) - - // Arrays used for edges2verts - std::vector<T> p_edge_in; // Dimensions: (nproma, nlev, nblks_e) - std::vector<int> edge_vert_idx; // Dimensions: (nproma, nblks_e, 6) - std::vector<int> edge_vert_blk; // Dimensions: (nproma, nblks_e, 6) - std::vector<T> v_int; // Dimensions: (nproma, 6, nblks_v) - std::vector<T> p_vert_out; // Dimensions: (nproma, nlev, nblks_v) - - // Arrays used for edges2cells - // std::vector<T> p_edge_in; // Dimensions: (nproma, nlev, nblks_e) - std::vector<int> edge_idx; // Dimensions: (nproma, nblks_c, 3) - std::vector<int> edge_blk; // Dimensions: (nproma, nblks_c, 3) - std::vector<T> coeff_int_cells; // Dimensions: (nproma, 3, nblks_c) - std::vector<T> p_cell_out; // Dimensions: (nproma, nlev, nblks_c) + protected: + + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + Kokkos::View<ValueType*, memory_space> p_vertex_in; + Kokkos::View<int*, memory_space> edge_vertex_idx; + Kokkos::View<int*, memory_space> edge_vertex_blk; + Kokkos::View<ValueType*, memory_space> coeff_int_edges; + Kokkos::View<ValueType*, memory_space> p_edge_out; + + // // Arrays used for edges2verts + Kokkos::View<ValueType*, memory_space> p_edge_in; + Kokkos::View<int*, memory_space> edge_vert_idx; + Kokkos::View<int*, memory_space> edge_vert_blk; + 
Kokkos::View<ValueType*, memory_space> v_int; + Kokkos::View<ValueType*, memory_space> p_vert_out; + + // // Arrays used for edges2cells + Kokkos::View<int*, memory_space> edge_idx; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<int*, memory_space> edge_blk; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<ValueType*, memory_space> coeff_int_cells; // Dimensions: (nproma, 3, nblks_c) + Kokkos::View<ValueType*, memory_space> p_cell_out; // Dimensions: (nproma, nlev, nblks_c) // Arrays used for verts2cells - std::vector<T> p_vert_in; // Dimensions: (nproma, nlev, nblks_v) - std::vector<int> cell_index_idx; // Dimensions: (nproma, nblks_c, 3) - std::vector<int> cell_index_blk; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<ValueType*, memory_space> p_vert_in; // Dimensions: (nproma, nlev, nblks_v) + Kokkos::View<int*, memory_space> cell_index_idx; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<int*, memory_space> cell_index_blk; // Dimensions: (nproma, nblks_c, 3) // Arrays used for avg_lib - std::vector<T> psi_c; // Dimensions: (nproma, nlev, nblks_c) - std::vector<int> cell_neighbor_idx; // Dimensions: (nproma, nblks_c, 3) - std::vector<int> cell_neighbor_blk; // Dimensions: (nproma, nblks_c, 3) - std::vector<T> avg_coeff; // Dimensions: (nproma, nlev, nblks_c) - std::vector<T> avg_psi_c; // Dimensions: (nproma, nlev, nblks_c) - - const int cell_type = 6; - const int npromz_c = 32; - - InterpolationScalarTypedTestFixture() { - // Allocate and initialize arrays needed for verts2edges - p_vertex_in.resize(num_elements_3d<T>(nproma, nlev, nblks_v), - static_cast<T>(1)); - edge_vertex_idx.resize(num_elements_3d<int>(nproma, nblks_e, 4), 1); - edge_vertex_blk.resize(num_elements_3d<int>(nproma, nblks_e, 4), 0); - coeff_int_edges.resize(num_elements_3d<T>(nproma, 2, nblks_e), - static_cast<T>(1)); - - p_edge_out.resize(num_elements_3d<T>(nproma, nlev, nblks_e), - static_cast<T>(0)); - - // Allocate & Initialize arrays needed for edges2verts - 
p_edge_in.resize(num_elements_3d<T>(nproma, nlev, nblks_e), - static_cast<T>(1)); - edge_vert_idx.resize(num_elements_3d<int>(nproma, nblks_e, 6), 1); - edge_vert_blk.resize(num_elements_3d<int>(nproma, nblks_e, 6), 0); - v_int.resize(num_elements_3d<T>(nproma, 6, nblks_v), static_cast<T>(1)); - - p_vert_out.resize(num_elements_3d<T>(nproma, nlev, nblks_v), - static_cast<T>(0)); - - // Allocate & Initialize arrays needed for edges2cells - edge_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1); - edge_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0); - coeff_int_cells.resize(num_elements_3d<T>(nproma, 3, nblks_c), - static_cast<T>(1)); - - p_cell_out.resize(num_elements_3d<T>(nproma, nlev, nblks_c), - static_cast<T>(0)); - - // Allocate and initialize arrays needed for verts2cells - p_vert_in.resize(num_elements_3d<T>(nproma, nlev, nblks_v), - static_cast<T>(1)); - cell_index_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1); - cell_index_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0); - - // Allocate and initialize arrays needed for avg_lib - psi_c.resize(num_elements_3d<T>(nproma, nlev, nblks_c), static_cast<T>(1)); - cell_neighbor_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1); - cell_neighbor_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0); - avg_coeff.resize(num_elements_3d<T>(nproma, nlev, nblks_c), - static_cast<T>(1)); - - // Allocate output arrays and initialize to zero. 
- avg_psi_c.resize(num_elements_3d<T>(nproma, nlev, nblks_c), - static_cast<T>(0)); - } + Kokkos::View<ValueType*, memory_space> psi_c; // Dimensions: (nproma, nlev, nblks_c) + Kokkos::View<int*, memory_space> cell_neighbor_idx; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<int*, memory_space> cell_neighbor_blk; // Dimensions: (nproma, nblks_c, 3) + Kokkos::View<ValueType*, memory_space> avg_coeff; // Dimensions: (nproma, 4, nblks_c) + Kokkos::View<ValueType*, memory_space> avg_psi_c; // Dimensions: (nproma, nlev, nblks_c) + + InterpolationScalarSingleParamTest() + : p_vertex_in("p_vertex_in", nproma * nlev * nblks_v), + edge_vertex_idx("edge_vertex_idx", nproma * nblks_e * 4), + edge_vertex_blk("edge_vertex_blk", nproma * nblks_e * 4), + coeff_int_edges("coeff_int_edges", nproma * 2 * nblks_e), + p_edge_out("p_edge_out", nproma * nlev * nblks_e), + + p_edge_in("p_edge_in", nproma * nlev * nblks_e), + edge_vert_idx("edge_vert_idx", nproma * nblks_e * 6), + edge_vert_blk("edge_vert_blk", nproma * nblks_e * 6), + v_int("v_int", nproma * 6 * nblks_v), + p_vert_out("p_vert_out", nproma * nlev * nblks_v), + + edge_idx("edge_idx", nproma * nblks_c * 3), + edge_blk("edge_blk", nproma * nblks_c * 3), + coeff_int_cells("coeff_int_cells", nproma * 3 * nblks_c), + p_cell_out("p_cell_out", nproma * nlev * nblks_c), + + p_vert_in("p_vert_in", nproma * nlev * nblks_v), + cell_index_idx("cell_index_idx", nproma * nblks_c * 3), + cell_index_blk("cell_index_blk", nproma * nblks_c * 3), + + psi_c("psi_c", nproma * nlev * nblks_c), + cell_neighbor_idx("cell_neighbor_idx", nproma * nblks_c * 3), + cell_neighbor_blk("cell_neighbor_blk", nproma * nblks_c * 3), + avg_coeff("avg_coeff", nproma * 4 * nblks_c), // 4 coefficients (self + 3 neighbors) + avg_psi_c("avg_psi_c", nproma * nlev * nblks_c) + {} }; typedef ::testing::Types<float, double> SingleType; -TYPED_TEST_SUITE(InterpolationScalarTypedTestFixture, SingleType); +TYPED_TEST_SUITE(InterpolationScalarSingleParamTest, 
SingleType); //////////////////////////////////////////////////////////////////////////////// // @@ -161,29 +138,203 @@ TYPED_TEST_SUITE(InterpolationScalarTypedTestFixture, SingleType); // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Edges) { +TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesSpecific) { + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_v = this->nblks_v; + constexpr int nblks_e = this->nblks_e; + + // Define indexing helpers + const auto &vertex_at = at<nproma, nlev, nblks_v>; + const auto &idx_at = at<nproma, nblks_e, 4>; + const auto &blk_at = at<nproma, nblks_e, 4>; + const auto &coeff_at = at<nproma, 2, nblks_e>; + const auto &edge_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_vertex_in_h = Kokkos::create_mirror_view(this->p_vertex_in); + auto edge_vertex_idx_h = Kokkos::create_mirror_view(this->edge_vertex_idx); + auto edge_vertex_blk_h = Kokkos::create_mirror_view(this->edge_vertex_blk); + auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges); + auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out); + + // Initialize with specific test values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vertex_in_h[vertex_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);; + } + } + } + + // Initialize edge connectivity indices with specific pattern + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each edge connects to two specific vertices + edge_vertex_idx_h[idx_at(ic, ib, 0)] = ic % (nproma - 1); // First vertex index + edge_vertex_idx_h[idx_at(ic, ib, 1)] = (ic + 1) % nproma; // Second vertex index + edge_vertex_idx_h[idx_at(ic, ib, 2)] = 0; // Not used + edge_vertex_idx_h[idx_at(ic, ib, 3)] = 0; // Not used + + 
edge_vertex_blk_h[blk_at(ic, ib, 0)] = ib % nblks_v; // First vertex block + edge_vertex_blk_h[blk_at(ic, ib, 1)] = (ib + 1) % nblks_v; // Second vertex block + edge_vertex_blk_h[blk_at(ic, ib, 2)] = 0; // Not used + edge_vertex_blk_h[blk_at(ic, ib, 3)] = 0; // Not used + + coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(0.5 + ic * 0.01); + coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(0.5 + ib * 0.01); + + // Initialize output to zero and calculate expected results + for (int ik = 0; ik < nlev; ++ik) { + p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // copy data to device + Kokkos::deep_copy(this->p_vertex_in, p_vertex_in_h); + Kokkos::deep_copy(this->edge_vertex_idx, edge_vertex_idx_h); + Kokkos::deep_copy(this->edge_vertex_blk, edge_vertex_blk_h); + Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h); + Kokkos::deep_copy(this->p_edge_out, p_edge_out_h); verts2edges_scalar_lib<TypeParam>( this->p_vertex_in.data(), this->edge_vertex_idx.data(), this->edge_vertex_blk.data(), this->coeff_int_edges.data(), this->p_edge_out.data(), this->i_startblk, this->i_endblk, - this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma, - this->nlev, this->nblks_v, this->nblks_e, this->lacc); - - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx, i_endidx] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 2 stencil points, - // expect 2. 
- EXPECT_NEAR(this->p_edge_out[idx], static_cast<TypeParam>(2), - static_cast<TypeParam>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + this->i_startidx, this->i_endidx, this->slev, this->elev, nproma, + nlev, nblks_v, nblks_e, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_edge_out_h, this->p_edge_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_edges = { + 1.505, 1.015, 1.605, 1.116, 1.705, 1.217, + 1.525, 1.0251, 1.626, 1.1271, 1.727, 1.2291 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) { + EXPECT_NEAR(p_edge_out_h[edge_at(jv, jk, jb)], + expected_edges[edge_at(jv, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; + } + } + } +} + +// Repeat the same test with randomized data +TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesRandom) { + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_v = this->nblks_v; + constexpr int nblks_e = this->nblks_e; + + // Define indexing helpers + const auto &vertex_at = at<nproma, nlev, nblks_v>; + const auto &idx_at = at<nproma, nblks_e, 4>; + const auto &blk_at = at<nproma, nblks_e, 4>; + const auto &coeff_at = at<nproma, 2, nblks_e>; + const auto &edge_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_vertex_in_h = Kokkos::create_mirror_view(this->p_vertex_in); + auto edge_vertex_idx_h = Kokkos::create_mirror_view(this->edge_vertex_idx); + auto edge_vertex_blk_h = Kokkos::create_mirror_view(this->edge_vertex_blk); + auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges); + auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); 
+ std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_v - 1); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vertex_in_h[vertex_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // Initialize edge connectivity indices with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + edge_vertex_idx_h[idx_at(ic, ib, 0)] = edge_distrib(gen); + edge_vertex_idx_h[idx_at(ic, ib, 1)] = edge_distrib(gen); + edge_vertex_idx_h[idx_at(ic, ib, 2)] = edge_distrib(gen); + edge_vertex_idx_h[idx_at(ic, ib, 3)] = edge_distrib(gen); + + edge_vertex_blk_h[blk_at(ic, ib, 0)] = block_distrib(gen); + edge_vertex_blk_h[blk_at(ic, ib, 1)] = block_distrib(gen); + edge_vertex_blk_h[blk_at(ic, ib, 2)] = block_distrib(gen); + edge_vertex_blk_h[blk_at(ic, ib, 3)] = block_distrib(gen); + + coeff_int_edges_h[coeff_at(ic, 0, ib)] = real_distrib(gen); + coeff_int_edges_h[coeff_at(ic, 1, ib)] = real_distrib(gen); + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // copy data to device + Kokkos::deep_copy(this->p_vertex_in, p_vertex_in_h); + Kokkos::deep_copy(this->edge_vertex_idx, edge_vertex_idx_h); + Kokkos::deep_copy(this->edge_vertex_blk, edge_vertex_blk_h); + Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h); + Kokkos::deep_copy(this->p_edge_out, p_edge_out_h); + + // Call the function + verts2edges_scalar_lib<TypeParam>( + this->p_vertex_in.data(), this->edge_vertex_idx.data(), + this->edge_vertex_blk.data(), this->coeff_int_edges.data(), + this->p_edge_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, nproma, + nlev, nblks_v, nblks_e, 
this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_edge_out_h, this->p_edge_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_edges("expected_edges", nproma, nlev, nblks_e); + + for (int ib = this->i_startblk; ib <= this->i_endblk; ++ib) { + for (int ik = this->slev; ik <= this->elev; ++ik) { + for (int ic = this->i_startidx; ic <= this->i_endidx; ++ic) { + // Compute expected values + expected_edges(ic, ik, ib) = + coeff_int_edges_h[coeff_at(ic, 0, ib)] * + p_vertex_in_h[vertex_at(edge_vertex_idx_h[idx_at(ic, ib, 0)], ik, + edge_vertex_blk_h[blk_at(ic, ib, 0)])] + + coeff_int_edges_h[coeff_at(ic, 1, ib)] * + p_vertex_in_h[vertex_at(edge_vertex_idx_h[idx_at(ic, ib, 1)], ik, + edge_vertex_blk_h[blk_at(ic, ib, 1)])]; + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) { + EXPECT_NEAR(p_edge_out_h[edge_at(jv, jk, jb)], + expected_edges(jv, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; } } } @@ -195,29 +346,196 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Edges) { // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Verts) { +TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = 
at<nproma, 6, nblks_v>; + const auto &vert_at = at<nproma, nlev, nblks_v>; + + // Create host mirror views + auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in); + auto edge_vert_idx_h = Kokkos::create_mirror_view(this->edge_vert_idx); + auto edge_vert_blk_h = Kokkos::create_mirror_view(this->edge_vert_blk); + auto v_int_h = Kokkos::create_mirror_view(this->v_int); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_edge_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize vertex connectivity indices with specific pattern + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex connects to 6 edges + for (int j = 0; j < 6; ++j) { + // Edge indices with a pattern + edge_vert_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma; + edge_vert_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_e; + + // Interpolation coefficients that depend on indices + v_int_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 6.0 + j * 0.01); +} + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_edge_in, p_edge_in_h); + Kokkos::deep_copy(this->edge_vert_idx, edge_vert_idx_h); + Kokkos::deep_copy(this->edge_vert_blk, edge_vert_blk_h); + Kokkos::deep_copy(this->v_int, v_int_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + // Call the function under test edges2verts_scalar_lib<TypeParam>( this->p_edge_in.data(), this->edge_vert_idx.data(), - this->edge_vert_blk.data(), this->v_int.data(), this->p_vert_out.data(), - this->i_startblk, this->i_endblk, this->i_startidx, this->i_endidx, - this->slev, this->elev, 
this->nproma, this->nlev, this->nblks_e, - this->nblks_v, this->lacc); - - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 6 stencil points, - // expect 6. - EXPECT_NEAR(this->p_vert_out[idx], static_cast<TypeParam>(6), - static_cast<TypeParam>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + this->edge_vert_blk.data(), this->v_int.data(), + this->p_vert_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_e, nblks_v, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_verts = { + 1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459, + 1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) { + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts[vert_at(jv, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; + } + } + } +} + +TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &edge_at 
= at<nproma, nlev, nblks_e>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = at<nproma, 6, nblks_v>; + const auto &vert_at = at<nproma, nlev, nblks_v>; + + // Create host mirror views + auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in); + auto edge_vert_idx_h = Kokkos::create_mirror_view(this->edge_vert_idx); + auto edge_vert_blk_h = Kokkos::create_mirror_view(this->edge_vert_blk); + auto v_int_h = Kokkos::create_mirror_view(this->v_int); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_edge_in_h[edge_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // Initialize vertex connectivity indices with random values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex connects to 6 edges + for (int j = 0; j < 6; ++j) { + edge_vert_idx_h[idx_at(ic, ib, j)] = edge_distrib(gen); + edge_vert_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + + // Random interpolation coefficients + v_int_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 6.0; // Scaled to ensure reasonable sums + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_edge_in, p_edge_in_h); + Kokkos::deep_copy(this->edge_vert_idx, edge_vert_idx_h); + Kokkos::deep_copy(this->edge_vert_blk, edge_vert_blk_h); + Kokkos::deep_copy(this->v_int, v_int_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + + 
// Call the function under test + edges2verts_scalar_lib<TypeParam>( + this->p_edge_in.data(), this->edge_vert_idx.data(), + this->edge_vert_blk.data(), this->v_int.data(), + this->p_vert_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_e, nblks_v, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) { + expected_verts(jv, jk, jb) = 0.0; + + for (int j = 0; j < 6; ++j) { + int edge_idx = edge_vert_idx_h[idx_at(jv, jb, j)]; + int edge_blk = edge_vert_blk_h[blk_at(jv, jb, j)]; + TypeParam coeff = v_int_h[coeff_at(jv, j, jb)]; + + expected_verts(jv, jk, jb) += coeff * p_edge_in_h[edge_at(edge_idx, jk, edge_blk)]; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) { + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts(jv, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; } } } @@ -229,55 +547,403 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Verts) { // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Cells) { +TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_c = this->nblks_c; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 3, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in); + auto edge_idx_h = Kokkos::create_mirror_view(this->edge_idx); + auto edge_blk_h = Kokkos::create_mirror_view(this->edge_blk); + auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells); + auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_edge_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + // Initialize cell connectivity indices with specific pattern + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + 
// Each cell connects to 3 edges + for (int j = 0; j < 3; ++j) { + // Edge indices with a pattern + edge_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma; + edge_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_e; + + // Interpolation coefficients that depend on indices + coeff_int_cells_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 3.0 + j * 0.01); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_edge_in, p_edge_in_h); + Kokkos::deep_copy(this->edge_idx, edge_idx_h); + Kokkos::deep_copy(this->edge_blk, edge_blk_h); + Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h); + Kokkos::deep_copy(this->p_cell_out, p_cell_out_h); + + // Call the function under test edges2cells_scalar_lib<TypeParam>( - this->p_edge_in.data(), this->edge_idx.data(), this->edge_blk.data(), - this->coeff_int_cells.data(), this->p_cell_out.data(), this->i_startblk, - this->i_endblk, this->i_startidx, this->i_endidx, this->slev, this->elev, - this->nproma, this->nlev, this->nblks_e, this->nblks_c, this->lacc); - - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 3 stencil points, - // expect 3. 
- EXPECT_NEAR(this->p_cell_out[idx], static_cast<TypeParam>(3), + this->p_edge_in.data(), this->edge_idx.data(), + this->edge_blk.data(), this->coeff_int_cells.data(), + this->p_cell_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_e, nblks_c, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_cell_out_h, this->p_cell_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_cells = { + 1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261, + 1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], + expected_cells[cell_at(jc, jk, jb)], static_cast<TypeParam>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + << "Failure at block " << jb << ", level " << jk << ", index " << jc; } } } } -TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Cells) { +TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_c = this->nblks_c; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 3, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in); + auto edge_idx_h = Kokkos::create_mirror_view(this->edge_idx); + auto edge_blk_h = Kokkos::create_mirror_view(this->edge_blk); + auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells); + auto 
p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_edge_in_h[edge_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // Initialize cell connectivity indices with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each cell connects to 3 edges + for (int j = 0; j < 3; ++j) { + edge_idx_h[idx_at(ic, ib, j)] = edge_distrib(gen); + edge_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + + // Random interpolation coefficients + coeff_int_cells_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 3.0; // Scaled to ensure reasonable sums + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_edge_in, p_edge_in_h); + Kokkos::deep_copy(this->edge_idx, edge_idx_h); + Kokkos::deep_copy(this->edge_blk, edge_blk_h); + Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h); + Kokkos::deep_copy(this->p_cell_out, p_cell_out_h); + + // Call the function under test + edges2cells_scalar_lib<TypeParam>( + this->p_edge_in.data(), this->edge_idx.data(), + this->edge_blk.data(), this->coeff_int_cells.data(), + this->p_cell_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_e, nblks_c, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_cell_out_h, this->p_cell_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, 
host_space> expected_cells("expected_cells", nproma, nlev, nblks_c); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + expected_cells(jc, jk, jb) = 0.0; + + for (int j = 0; j < 3; ++j) { + int edge_index = edge_idx_h[idx_at(jc, jb, j)]; + int edge_block = edge_blk_h[blk_at(jc, jb, j)]; + TypeParam coeff = coeff_int_cells_h[coeff_at(jc, j, jb)]; + + expected_cells(jc, jk, jb) += coeff * p_edge_in_h[edge_at(edge_index, jk, edge_block)]; + } + } + } + } + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], + expected_cells(jc, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jc; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +// ! 
verts2cells +// +//////////////////////////////////////////////////////////////////////////////// + +TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_v = this->nblks_v; + constexpr int nblks_c = this->nblks_c; + constexpr int npromz_c = this->npromz_c; + + // Define indexing helpers + const auto &vert_at = at<nproma, nlev, nblks_v>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 3, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vert_in_h = Kokkos::create_mirror_view(this->p_vert_in); + auto cell_index_idx_h = Kokkos::create_mirror_view(this->cell_index_idx); + auto cell_index_blk_h = Kokkos::create_mirror_view(this->cell_index_blk); + auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells); + auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_vert_in_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize cell connectivity indices with specific pattern + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each cell connects to 3 vertices + for (int j = 0; j < 3; ++j) { + // Vertex indices with a pattern + cell_index_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma; + cell_index_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_v; + + // Interpolation coefficients that depend on indices + coeff_int_cells_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 3.0 + j * 0.01); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_cell_out_h[cell_at(ic, ik, ib)] = 
static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vert_in, p_vert_in_h); + Kokkos::deep_copy(this->cell_index_idx, cell_index_idx_h); + Kokkos::deep_copy(this->cell_index_blk, cell_index_blk_h); + Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h); + Kokkos::deep_copy(this->p_cell_out, p_cell_out_h); + + // Call the function under test verts2cells_scalar_lib<TypeParam>( this->p_vert_in.data(), this->cell_index_idx.data(), this->cell_index_blk.data(), this->coeff_int_cells.data(), - this->p_cell_out.data(), this->nblks_c, this->npromz_c, this->slev, - this->elev, this->nproma, this->nlev, this->nblks_v, this->lacc); - - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 3 stencil points, - // expect 3. - EXPECT_NEAR(this->p_cell_out[idx], static_cast<TypeParam>(3), + this->p_cell_out.data(), nblks_c, npromz_c, this->slev, this->elev, + nproma, nlev, nblks_v, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_cell_out_h, this->p_cell_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_cells = { + 1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261, + 1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953 + }; + + // Verify results - check the same ranges as in the expected calculation + for (int jb = 0; jb < nblks_c; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + int nlen = (jb != nblks_c - 1) ? nproma : npromz_c; + int start_idx = (jb >= this->i_startblk && jb <= this->i_endblk) ? 
this->i_startidx : 0; + int end_idx = (jb >= this->i_startblk && jb <= this->i_endblk) ? this->i_endidx : nlen - 1; + + for (int jc = start_idx; jc <= end_idx; ++jc) { + EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], + expected_cells[cell_at(jc, jk, jb)], static_cast<TypeParam>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + << "Failure at block " << jb << ", level " << jk << ", index " << jc; + } + } + } +} + +TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_v = this->nblks_v; + constexpr int nblks_c = this->nblks_c; + constexpr int npromz_c = this->npromz_c; + + // Define indexing helpers + const auto &vert_at = at<nproma, nlev, nblks_v>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 3, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vert_in_h = Kokkos::create_mirror_view(this->p_vert_in); + auto cell_index_idx_h = Kokkos::create_mirror_view(this->cell_index_idx); + auto cell_index_blk_h = Kokkos::create_mirror_view(this->cell_index_blk); + auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells); + auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> vert_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_v - 1); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vert_in_h[vert_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // Initialize cell connectivity indices with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for 
(int ic = 0; ic < nproma; ++ic) { + // Each cell connects to 3 vertices + for (int j = 0; j < 3; ++j) { + cell_index_idx_h[idx_at(ic, ib, j)] = vert_distrib(gen); + cell_index_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + + // Random interpolation coefficients + coeff_int_cells_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 3.0; // Scaled to ensure reasonable sums + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vert_in, p_vert_in_h); + Kokkos::deep_copy(this->cell_index_idx, cell_index_idx_h); + Kokkos::deep_copy(this->cell_index_blk, cell_index_blk_h); + Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h); + Kokkos::deep_copy(this->p_cell_out, p_cell_out_h); + + // Call the function under test + verts2cells_scalar_lib<TypeParam>( + this->p_vert_in.data(), this->cell_index_idx.data(), + this->cell_index_blk.data(), this->coeff_int_cells.data(), + this->p_cell_out.data(), nblks_c, npromz_c, this->slev, this->elev, + nproma, nlev, nblks_v, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_cell_out_h, this->p_cell_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_cells("expected_cells", nproma, nlev, nblks_c); + + // Compute expected values + for (int jb = 0; jb < nblks_c; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + int nlen = (jb != nblks_c - 1) ? 
nproma : npromz_c; + for (int jc = 0; jc < nlen; ++jc) { + expected_cells(jc, jk, jb) = 0.0; + + for (int j = 0; j < 3; ++j) { + int vert_index = cell_index_idx_h[idx_at(jc, jb, j)]; + int vert_block = cell_index_blk_h[blk_at(jc, jb, j)]; + TypeParam coeff = coeff_int_cells_h[coeff_at(jc, j, jb)]; + + expected_cells(jc, jk, jb) += coeff * p_vert_in_h[vert_at(vert_index, jk, vert_block)]; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = 0; jb < nblks_c; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + int nlen = (jb != nblks_c - 1) ? nproma : npromz_c; + for (int jc = 0; jc < nlen; ++jc) { + EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], + expected_cells(jc, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jc; } } } @@ -289,48 +955,226 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Cells) { // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarTypedTestFixture, AvgLib) { +TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + + // Define indexing helpers + const auto &psi_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 4, nblks_c>; // 4 coefficients (self + 3 neighbors) + const auto &avg_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto psi_c_h = Kokkos::create_mirror_view(this->psi_c); + auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto avg_psi_c_h = 
Kokkos::create_mirror_view(this->avg_psi_c); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + psi_c_h[psi_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } - // Call the function - cell_avg_lib<TypeParam>(this->psi_c.data(), this->cell_neighbor_idx.data(), - this->cell_neighbor_blk.data(), - this->avg_coeff.data(), this->avg_psi_c.data(), - this->i_startblk, this->i_endblk, this->i_startidx, - this->i_endidx, this->slev, this->elev, this->nproma, - this->nlev, this->nblks_c, this->lacc); - - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 4 stencil points, - // expect 4. 
- EXPECT_NEAR(this->avg_psi_c[idx], static_cast<TypeParam>(4), + // Initialize cell neighbor indices with specific pattern + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each cell has 3 neighboring cells + for (int j = 0; j < 3; ++j) { + // Neighbor indices with a pattern + cell_neighbor_idx_h[idx_at(ic, ib, j)] = (ic + j + 1) % nproma; + cell_neighbor_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_c; + } + + // Averaging coefficients - one for the cell itself and one for each neighbor + avg_coeff_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(0.4); // Self weight + avg_coeff_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(0.2); // First neighbor + avg_coeff_h[coeff_at(ic, 2, ib)] = static_cast<TypeParam>(0.2); // Second neighbor + avg_coeff_h[coeff_at(ic, 3, ib)] = static_cast<TypeParam>(0.2); // Third neighbor + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + avg_psi_c_h[avg_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->psi_c, psi_c_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->avg_psi_c, avg_psi_c_h); + + // Call the function under test + cell_avg_lib<TypeParam>( + this->psi_c.data(), this->cell_neighbor_idx.data(), + this->cell_neighbor_blk.data(), this->avg_coeff.data(), + this->avg_psi_c.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_c, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(avg_psi_c_h, this->avg_psi_c); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_avg = { + 1.402, 1.602, 1.502, 1.702, 1.602, 1.802, + 1.408, 1.608, 1.508, 1.708, 1.608, 1.808 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= 
this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + EXPECT_NEAR(avg_psi_c_h[avg_at(jc, jk, jb)], + expected_avg[avg_at(jc, jk, jb)], static_cast<TypeParam>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + << "Failure at block " << jb << ", level " << jk << ", index " << jc; + } + } + } +} + +TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + + // Define indexing helpers + const auto &psi_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_c, 3>; + const auto &blk_at = at<nproma, nblks_c, 3>; + const auto &coeff_at = at<nproma, 4, nblks_c>; // 4 coefficients (self + 3 neighbors) + const auto &avg_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto psi_c_h = Kokkos::create_mirror_view(this->psi_c); + auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx); + auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk); + auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff); + auto avg_psi_c_h = Kokkos::create_mirror_view(this->avg_psi_c); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> cell_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + std::uniform_real_distribution<TypeParam> coeff_distrib(0.01, 0.5); // Keep coefficients reasonable + + // Initialize with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + psi_c_h[psi_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // Initialize cell neighbor indices with random values + for (int ib = 0; ib < nblks_c; ++ib) { + 
for (int ic = 0; ic < nproma; ++ic) { + // Each cell has 3 neighboring cells + for (int j = 0; j < 3; ++j) { + cell_neighbor_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen); + cell_neighbor_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + } + + avg_coeff_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + avg_coeff_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + avg_coeff_h[coeff_at(ic, 2, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + avg_coeff_h[coeff_at(ic, 3, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + avg_psi_c_h[avg_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->psi_c, psi_c_h); + Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h); + Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h); + Kokkos::deep_copy(this->avg_coeff, avg_coeff_h); + Kokkos::deep_copy(this->avg_psi_c, avg_psi_c_h); + + // Call the function under test + cell_avg_lib<TypeParam>( + this->psi_c.data(), this->cell_neighbor_idx.data(), + this->cell_neighbor_blk.data(), this->avg_coeff.data(), + this->avg_psi_c.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_c, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(avg_psi_c_h, this->avg_psi_c); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_avg("expected_avg", nproma, nlev, nblks_c); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + // Self contribution + expected_avg(jc, jk, jb) = + psi_c_h[psi_at(jc, jk, jb)] * avg_coeff_h[coeff_at(jc, 0, jb)]; + + // Neighbor contributions + for (int j = 0; j < 3; ++j) { + 
int neighbor_idx = cell_neighbor_idx_h[idx_at(jc, jb, j)]; + int neighbor_blk = cell_neighbor_blk_h[blk_at(jc, jb, j)]; + TypeParam coeff = avg_coeff_h[coeff_at(jc, j+1, jb)]; + + expected_avg(jc, jk, jb) += + psi_c_h[psi_at(neighbor_idx, jk, neighbor_blk)] * coeff; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) { + EXPECT_NEAR(avg_psi_c_h[avg_at(jc, jk, jb)], + expected_avg(jc, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jc; } } } } template <typename TypePair> -class InterpolationScalarMixedTestFixture : public ::testing::Test, +class InterpolationScalarDoubleParamTest : public ::testing::Test, public interp_dimensions { -public: + protected: using InType = typename TypePair::in_type; using OutType = typename TypePair::out_type; + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + // Arrays used for cells2edges - std::vector<InType> p_cell_in; // Dimensions: (nproma, nlev, nblks_c) - std::vector<int> edge_cell_idx; // Dimensions: (nproma, nblks_e, 2) - std::vector<int> edge_cell_blk; // Dimensions: (nproma, nblks_e, 2) - std::vector<OutType> coeff_int_edges; // Dimensions: (nproma, 2, nblks_e) - std::vector<OutType> p_edge_out; // Dimensions: (nproma, nlev, nblks_e) + Kokkos::View<InType*, memory_space> p_cell_in; + Kokkos::View<int*, memory_space> edge_cell_idx; + Kokkos::View<int*, memory_space> edge_cell_blk; + Kokkos::View<OutType*, memory_space> coeff_int_edges; + Kokkos::View<OutType*, memory_space> p_edge_out; // Further parameters for cells2edges const int patch_id = 0; @@ -342,41 +1186,31 @@ public: std::vector<int> i_endidx_in; // 
Dimensions: (2) // Arrays used for cells2verts - std::vector<int> vert_cell_idx; // Dimensions: (nproma, nblks_v, 6) - std::vector<int> vert_cell_blk; // Dimensions: (nproma, nblks_v, 6) - std::vector<OutType> coeff_int_verts; // Dimensions: (nproma, 6, nblks_v) - std::vector<OutType> p_vert_out; // Dimensions: (nproma, nlev, nblks_v) - - InterpolationScalarMixedTestFixture() { - // Allocate and initialize arrays needed for cells2edges - p_cell_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_c), - static_cast<InType>(1)); - edge_cell_idx.resize(num_elements_3d<int>(nproma, nblks_e, 2), 1); - edge_cell_blk.resize(num_elements_3d<int>(nproma, nblks_e, 2), 0); - coeff_int_edges.resize(num_elements_3d<InType>(nproma, 2, nblks_e), - static_cast<OutType>(1)); - - p_edge_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_e), - static_cast<OutType>(0)); - + Kokkos::View<int*, memory_space> vert_cell_idx; + Kokkos::View<int*, memory_space> vert_cell_blk; + Kokkos::View<OutType*, memory_space> coeff_int_verts; + Kokkos::View<OutType*, memory_space> p_vert_out; + + InterpolationScalarDoubleParamTest() + : p_cell_in("p_cell_in", nproma * nlev * nblks_c), + edge_cell_idx("edge_cell_idx", nproma * nblks_e * 2), + edge_cell_blk("edge_cell_blk", nproma * nblks_e * 2), + coeff_int_edges("coeff_int_edges", nproma * 2 * nblks_e), + p_edge_out("p_edge_out", nproma * nlev * nblks_e), + vert_cell_idx("vert_cell_idx", nproma * nblks_v * 6), + vert_cell_blk("vert_cell_blk", nproma * nblks_v * 6), + coeff_int_verts("coeff_int_verts", nproma * 6 * nblks_v), + p_vert_out("p_vert_out", nproma * nlev * nblks_v) + { // Allocate neighbour indexes for cells2edges i_startblk_in.resize(2, i_startblk); i_endblk_in.resize(2, i_endblk); i_startidx_in.resize(2, i_startidx); i_endidx_in.resize(2, i_endidx); - - // Allocate & Initialize arrays needed for cells2verts - vert_cell_idx.resize(num_elements_3d<int>(nproma, nblks_v, 6), 1); - vert_cell_blk.resize(num_elements_3d<int>(nproma, nblks_v, 
6), 0); - coeff_int_verts.resize(num_elements_3d<InType>(nproma, 6, nblks_v), - static_cast<OutType>(1)); - - p_vert_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v), - static_cast<OutType>(0)); } }; -TYPED_TEST_SUITE(InterpolationScalarMixedTestFixture, MixedTypesSP2DP); +TYPED_TEST_SUITE(InterpolationScalarDoubleParamTest, MixedTypesSP2DP); //////////////////////////////////////////////////////////////////////////////// // @@ -384,34 +1218,234 @@ TYPED_TEST_SUITE(InterpolationScalarMixedTestFixture, MixedTypesSP2DP); // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Edges) { +TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesSpecific) { using InType = typename TestFixture::InType; using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_e = this->nblks_e; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &edge_idx_at = at<nproma, nblks_e, 2>; + const auto &edge_blk_at = at<nproma, nblks_e, 2>; + const auto &coeff_at = at<nproma, 2, nblks_e>; + const auto &edge_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto edge_cell_idx_h = Kokkos::create_mirror_view(this->edge_cell_idx); + auto edge_cell_blk_h = Kokkos::create_mirror_view(this->edge_cell_blk); + auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges); + auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } - // 
Call the function + // Initialize edge connectivity indices with specific pattern + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each edge connects to 2 cells + edge_cell_idx_h[edge_idx_at(ic, ib, 0)] = ic % nproma; // First cell index + edge_cell_idx_h[edge_idx_at(ic, ib, 1)] = (ic + 1) % nproma; // Second cell index + + edge_cell_blk_h[edge_blk_at(ic, ib, 0)] = ib % nblks_c; // First cell block + edge_cell_blk_h[edge_blk_at(ic, ib, 1)] = (ib + 1) % nblks_c; // Second cell block + + // Interpolation coefficients that depend on indices + coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<OutType>(0.5 + ic * 0.01); + coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<OutType>(0.5 - ic * 0.01); + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<OutType>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->edge_cell_idx, edge_cell_idx_h); + Kokkos::deep_copy(this->edge_cell_blk, edge_cell_blk_h); + Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h); + Kokkos::deep_copy(this->p_edge_out, p_edge_out_h); + + // Call the function under test cells2edges_scalar_lib<InType, OutType>( this->p_cell_in.data(), this->edge_cell_idx.data(), this->edge_cell_blk.data(), this->coeff_int_edges.data(), this->p_edge_out.data(), this->i_startblk_in.data(), this->i_endblk_in.data(), this->i_startidx_in.data(), - this->i_endidx_in.data(), this->slev, this->elev, this->nproma, - this->nlev, this->nblks_c, this->nblks_e, this->patch_id, + this->i_endidx_in.data(), this->slev, this->elev, nproma, + nlev, nblks_c, nblks_e, this->patch_id, this->l_limited_area, this->lfill_latbc, this->lacc); - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = 
this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 2 stencil points, - // expect 2. - EXPECT_NEAR(this->p_edge_out[idx], static_cast<OutType>(2), + // Copy results back to host + Kokkos::deep_copy(p_edge_out_h, this->p_edge_out); + + int i_startblk = this->i_startblk_in[1]; + int i_endblk = this->i_endblk_in[1]; + int i_startidx_range = this->i_startidx_in[1]; + int i_endidx_range = this->i_endidx_in[1]; + + // Expected results based on the specific test values + std::vector<OutType> expected_edges = { + 1.505, 1.5149, 1.605, 1.6149, 1.705, 1.7149, + 1.505, 1.5151, 1.605, 1.6151, 1.705, 1.7151 + }; + + // Verify results + for (int jb = i_startblk; jb <= i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb, + i_startblk, i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + EXPECT_NEAR(p_edge_out_h[edge_at(je, jk, jb)], + expected_edges[edge_at(je, jk, jb)], static_cast<OutType>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; + << "Failure at block " << jb << ", level " << jk << ", index " << je; + } + } + } +} + +TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesRandom) { + using InType = typename TestFixture::InType; + using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_e = this->nblks_e; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &edge_idx_at = at<nproma, nblks_e, 2>; + const auto &edge_blk_at = at<nproma, nblks_e, 2>; + const auto &coeff_at = 
at<nproma, 2, nblks_e>; + const auto &edge_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto edge_cell_idx_h = Kokkos::create_mirror_view(this->edge_cell_idx); + auto edge_cell_blk_h = Kokkos::create_mirror_view(this->edge_cell_blk); + auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges); + auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> cell_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen)); + } + } + } + + // Initialize edge connectivity indices with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each edge connects to 2 cells + edge_cell_idx_h[edge_idx_at(ic, ib, 0)] = cell_distrib(gen); + edge_cell_idx_h[edge_idx_at(ic, ib, 1)] = cell_distrib(gen); + + edge_cell_blk_h[edge_blk_at(ic, ib, 0)] = block_distrib(gen); + edge_cell_blk_h[edge_blk_at(ic, ib, 1)] = block_distrib(gen); + + coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<OutType>(real_distrib(gen)); + coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<OutType>(real_distrib(gen)); + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<OutType>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->edge_cell_idx, edge_cell_idx_h); + Kokkos::deep_copy(this->edge_cell_blk, edge_cell_blk_h); + Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h); + 
Kokkos::deep_copy(this->p_edge_out, p_edge_out_h); + + // Call the function under test + cells2edges_scalar_lib<InType, OutType>( + this->p_cell_in.data(), this->edge_cell_idx.data(), + this->edge_cell_blk.data(), this->coeff_int_edges.data(), + this->p_edge_out.data(), this->i_startblk_in.data(), + this->i_endblk_in.data(), this->i_startidx_in.data(), + this->i_endidx_in.data(), this->slev, this->elev, nproma, + nlev, nblks_c, nblks_e, this->patch_id, + this->l_limited_area, this->lfill_latbc, this->lacc); + + // Copy results back to host + Kokkos::deep_copy(p_edge_out_h, this->p_edge_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<OutType***, host_space> expected_edges("expected_edges", nproma, nlev, nblks_e); + + // Since we're not testing the lateral boundary condition filling + // (this->l_limited_area == false && this->lfill_latbc == false), + // we only need to check the blocks in i_startblk_in[1] to i_endblk_in[1] + int i_startblk = this->i_startblk_in[1]; + int i_endblk = this->i_endblk_in[1]; + int i_startidx_range = this->i_startidx_in[1]; + int i_endidx_range = this->i_endidx_in[1]; + + // Compute expected values + for (int jb = i_startblk; jb <= i_endblk; ++jb) { + // Get the actual indices to process for this block + int i_startidx, i_endidx; + get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb, + i_startblk, i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + expected_edges(je, jk, jb) = + static_cast<OutType>(coeff_int_edges_h[coeff_at(je, 0, jb)] * + p_cell_in_h[cell_at(edge_cell_idx_h[edge_idx_at(je, jb, 0)], + jk, + edge_cell_blk_h[edge_blk_at(je, jb, 0)])]) + + static_cast<OutType>(coeff_int_edges_h[coeff_at(je, 1, jb)] * + p_cell_in_h[cell_at(edge_cell_idx_h[edge_idx_at(je, jb, 1)], + jk, + edge_cell_blk_h[edge_blk_at(je, jb, 1)])]); + } + } + } + + OutType tol = std::is_same<OutType, 
float>::value ? + static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13); + + // Verify results + for (int jb = i_startblk; jb <= i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb, + i_startblk, i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + EXPECT_NEAR(p_edge_out_h[edge_at(je, jk, jb)], + expected_edges(je, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << je; } } } @@ -423,31 +1457,217 @@ TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Edges) { // //////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Verts) { +TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsSpecific) { using InType = typename TestFixture::InType; using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = at<nproma, 6, nblks_v>; + const auto &vert_at = at<nproma, nlev, nblks_v>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx); + auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk); + auto coeff_int_verts_h = Kokkos::create_mirror_view(this->coeff_int_verts); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + 
p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + // Initialize vertex connectivity indices with specific pattern + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex is connected to 6 cells + for (int j = 0; j < 6; ++j) { + // Cell indices with a pattern + vert_cell_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma; + vert_cell_blk_h[blk_at(ic, ib, j)] = (ib + j % nblks_c) % nblks_c; + + // Interpolation coefficients that depend on indices + coeff_int_verts_h[coeff_at(ic, j, ib)] = static_cast<OutType>(1.0 / 6.0 + j * 0.01); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h); + Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h); + Kokkos::deep_copy(this->coeff_int_verts, coeff_int_verts_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + + // Call the function under test cells2verts_scalar_lib<InType, OutType>( this->p_cell_in.data(), this->vert_cell_idx.data(), this->vert_cell_blk.data(), this->coeff_int_verts.data(), this->p_vert_out.data(), this->i_startblk, this->i_endblk, - this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma, - this->nlev, this->nblks_c, this->nblks_v, this->lacc, this->acc_async); + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async); + + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Expected results based on the specific test values + std::vector<OutType> expected_verts = { + 1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459, + 1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, 
i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts[vert_at(jv, jk, jb)], + static_cast<OutType>(1e-5)) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; + } + } + } +} - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 6 stencil points, - // expect 6. - EXPECT_NEAR(this->p_vert_out[idx], static_cast<OutType>(6), - static_cast<OutType>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; +TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsRandom) { + using InType = typename TestFixture::InType; + using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = at<nproma, 6, nblks_v>; + const auto &vert_at = at<nproma, nlev, nblks_v>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx); + auto vert_cell_blk_h = 
Kokkos::create_mirror_view(this->vert_cell_blk); + auto coeff_int_verts_h = Kokkos::create_mirror_view(this->coeff_int_verts); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> cell_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + std::uniform_real_distribution<double> coeff_distrib(0.01, 0.3); // Keep coefficients reasonable + + // Initialize with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen)); + } + } + } + + // Initialize vertex connectivity indices with random values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex is connected to 6 cells + for (int j = 0; j < 6; ++j) { + vert_cell_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen); + vert_cell_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + + // Normalized coefficients + coeff_int_verts_h[coeff_at(ic, j, ib)] = static_cast<OutType>(coeff_distrib(gen)); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h); + Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h); + Kokkos::deep_copy(this->coeff_int_verts, coeff_int_verts_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + + // Call the function under test + cells2verts_scalar_lib<InType, OutType>( + this->p_cell_in.data(), this->vert_cell_idx.data(), + this->vert_cell_blk.data(), this->coeff_int_verts.data(), + this->p_vert_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, 
this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async); + + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<OutType***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + // Get the actual indices to process for this block + int i_startidx, i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { + expected_verts(jv, jk, jb) = static_cast<OutType>(0.0); + + for (int j = 0; j < 6; ++j) { + int cell_idx = vert_cell_idx_h[idx_at(jv, jb, j)]; + int cell_blk = vert_cell_blk_h[blk_at(jv, jb, j)]; + OutType coeff = coeff_int_verts_h[coeff_at(jv, j, jb)]; + + expected_verts(jv, jk, jb) += + static_cast<OutType>(coeff * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]); + } + } + } + } + + OutType tol = std::is_same<OutType, float>::value ? 
+ static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts(jv, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; } } } @@ -460,7 +1680,7 @@ TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Verts) { //////////////////////////////////////////////////////////////////////////////// // The test for cells2verts_ri is similar to cells2verts, but is done here -// separtely to avoid as a differebt template instantiation is needed for the +// separately, as a different template instantiation is needed for the // function call template <typename Types> class Cells2vertsriScalarLibTestFixture : public testing::Test, @@ -469,36 +1689,102 @@ public: using InType = typename Types::in_type; using OutType = typename Types::out_type; + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + // Arrays stored in std::vector. - std::vector<InType> p_cell_in; // Dimensions: (nproma, nlev, nblks_c) - std::vector<int> vert_cell_idx; // Dimensions: (nproma, nblks_v, 6) - std::vector<int> vert_cell_blk; // Dimensions: (nproma, nblks_v, 6) - std::vector<InType> coeff_int; // Dimensions: (nproma, 6, nblks_v) - std::vector<OutType> p_vert_out; // Dimensions: (nproma, nlev, nblks_v) - - Cells2vertsriScalarLibTestFixture() { - // Allocate and initialize inputs. 
- p_cell_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_c), - static_cast<InType>(1)); - vert_cell_idx.resize(num_elements_3d<int>(nproma, nblks_v, 6), 1); - vert_cell_blk.resize(num_elements_3d<int>(nproma, nblks_v, 6), 0); - coeff_int.resize(num_elements_3d<InType>(nproma, 6, nblks_v), - static_cast<InType>(1)); - - // Allocate output arrays and initialize to zero. - p_vert_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v), - static_cast<OutType>(0)); - } + Kokkos::View<InType*, memory_space> p_cell_in; + Kokkos::View<int*, memory_space> vert_cell_idx; + Kokkos::View<int*, memory_space> vert_cell_blk; + Kokkos::View<InType*, memory_space> coeff_int; + Kokkos::View<OutType*, memory_space> p_vert_out; + + Cells2vertsriScalarLibTestFixture() + : p_cell_in("p_cell_in", nproma * nlev * nblks_c), + vert_cell_idx("vert_cell_idx", nproma * nblks_v * 6), + vert_cell_blk("vert_cell_blk", nproma * nblks_v * 6), + coeff_int("coeff_int", nproma * 6 * nblks_v), + p_vert_out("p_vert_out", nproma * nlev * nblks_v) + {} }; // Add test suite TYPED_TEST_SUITE(Cells2vertsriScalarLibTestFixture, MixedTypes); // Add test -TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRI) { +TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRISpecific) { using InType = typename TestFixture::InType; using OutType = typename TestFixture::OutType; + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = at<nproma, 6, nblks_v>; + + // For output, we need to handle different layouts depending on __LOOP_EXCHANGE + // This is a special case for this function +#ifdef __LOOP_EXCHANGE + const auto &vert_at = at<nproma, nlev, nblks_c>; // jv, jk, jb order +#else + const auto &vert_at = 
at<nlev, nproma, nblks_c>; // jk, jv, jb order +#endif + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx); + auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk); + auto coeff_int_h = Kokkos::create_mirror_view(this->coeff_int); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Initialize with index-based test values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize vertex connectivity indices with specific pattern + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex is connected to 6 cells + for (int j = 0; j < 6; ++j) { + // Cell indices with a pattern + vert_cell_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma; + vert_cell_blk_h[blk_at(ic, ib, j)] = (ib + j % nblks_c) % nblks_c; + + // Interpolation coefficients that depend on indices + coeff_int_h[coeff_at(ic, j, ib)] = static_cast<OutType>(1.0 / 6.0 + j * 0.01); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + // Handle different indexing depending on __LOOP_EXCHANGE +#ifdef __LOOP_EXCHANGE + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); +#else + p_vert_out_h[vert_at(ik, ic, ib)] = static_cast<OutType>(0.0); +#endif + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h); + Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h); + Kokkos::deep_copy(this->coeff_int, coeff_int_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + // Call the function cells2verts_scalar_ri_lib<InType, OutType>( this->p_cell_in.data(), this->vert_cell_idx.data(), @@ 
-507,25 +1793,180 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRI) { this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma, this->nlev, this->nblks_c, this->nblks_v, this->lacc, this->acc_async); - // Check the outputs only for blocks in the range - // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] } - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = this->slev; level < this->elev; ++level) { - for (int i = this->i_startidx; i < this->i_endidx; ++i) { - // Compute the linear index for a 3D array in column-major order: + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Expected results based on the specific test values + std::vector<OutType> expected_verts = { #ifdef __LOOP_EXCHANGE - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; + 1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459, + 1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456 #else - size_t idx = level + i * this->nlev + block * this->nproma * this->nlev; + 1.7459, 1.8609, 1.9759, 1.7159, 1.8309, 1.9459, + 1.7456, 1.8606, 1.9756, 1.7156, 1.8306, 1.9456 +#endif + }; + + // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { +#ifdef __LOOP_EXCHANGE + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts[vert_at(jv, jk, jb)], + static_cast<OutType>(1e-5)) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; +#else + EXPECT_NEAR(p_vert_out_h[vert_at(jk, jv, jb)], + expected_verts[vert_at(jk, jv, jb)], + static_cast<OutType>(1e-5)) + << "Failure at block " << jb << ", level " << jk << 
", index " << jv; +#endif + } + } + } +} + +TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRIRandom) { + using InType = typename TestFixture::InType; + using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_v = this->nblks_v; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<nproma, nblks_v, 6>; + const auto &blk_at = at<nproma, nblks_v, 6>; + const auto &coeff_at = at<nproma, 6, nblks_v>; + + // For output, we need to handle different layouts depending on __LOOP_EXCHANGE +#ifdef __LOOP_EXCHANGE + const auto &vert_at = at<nproma, nlev, nblks_v>; // jv, jk, jb order +#else + const auto &vert_at = at<nlev, nproma, nblks_v>; // jk, jv, jb order +#endif + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx); + auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk); + auto coeff_int_h = Kokkos::create_mirror_view(this->coeff_int); + auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> cell_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + std::uniform_real_distribution<double> coeff_distrib(0.01, 0.3); // Keep coefficients reasonable + + // Initialize with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen)); + } + } + } + + // Initialize vertex connectivity indices with random values + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // 
Each vertex is connected to 6 cells + for (int j = 0; j < 6; ++j) { + vert_cell_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen); + vert_cell_blk_h[blk_at(ic, ib, j)] = block_distrib(gen); + + // Normalized coefficients + coeff_int_h[coeff_at(ic, j, ib)] = static_cast<InType>(coeff_distrib(gen)); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + // Handle different indexing depending on __LOOP_EXCHANGE +#ifdef __LOOP_EXCHANGE + p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); +#else + p_vert_out_h[vert_at(ik, ic, ib)] = static_cast<OutType>(0.0); +#endif + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h); + Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h); + Kokkos::deep_copy(this->coeff_int, coeff_int_h); + Kokkos::deep_copy(this->p_vert_out, p_vert_out_h); + + // Call the function + cells2verts_scalar_ri_lib<InType, OutType>( + this->p_cell_in.data(), this->vert_cell_idx.data(), + this->vert_cell_blk.data(), this->coeff_int.data(), + this->p_vert_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx, this->i_endidx, this->slev, this->elev, + nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async); + + // Copy results back to host + Kokkos::deep_copy(p_vert_out_h, this->p_vert_out); + + // Prepare expected results storage + using host_space = Kokkos::HostSpace; + Kokkos::View<OutType***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + // Get the actual indices to process for this block + int i_startidx, i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { + expected_verts(jv, jk, jb) = static_cast<OutType>(0.0); 
+ + for (int j = 0; j < 6; ++j) { + int cell_idx = vert_cell_idx_h[idx_at(jv, jb, j)]; + int cell_blk = vert_cell_blk_h[blk_at(jv, jb, j)]; + InType coeff = coeff_int_h[coeff_at(jv, j, jb)]; + + expected_verts(jv, jk, jb) += + static_cast<OutType>(coeff * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]); + } + } + } + } + + OutType tol = std::is_same<OutType, float>::value ? + static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13); + + // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jv = i_startidx; jv <= i_endidx; ++jv) { +#ifdef __LOOP_EXCHANGE + EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], + expected_verts(jv, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; +#else + EXPECT_NEAR(p_vert_out_h[vert_at(jk, jv, jb)], + expected_verts(jv, jk, jb), tol) + << "Failure at block " << jb << ", level " << jk << ", index " << jv; #endif - // Since every contribution is 1 and there are 6 stencil points, - // expect 6. - EXPECT_NEAR(this->p_vert_out[idx], static_cast<OutType>(6), - static_cast<OutType>(1e-5)) - << "Failure at block " << block << ", level " << level << ", index " - << i; } } } } + diff --git a/test/c/test_interpolation_vector.cpp b/test/c/test_interpolation_vector.cpp index 680fb6e5ac669549b7e96f3fd5c94ba7a69edd3e..0806e3575f1d4eb12c4b0f9cc10a4b51497aa30e 100644 --- a/test/c/test_interpolation_vector.cpp +++ b/test/c/test_interpolation_vector.cpp @@ -12,104 +12,375 @@ #include <Kokkos_Core.hpp> #include <gtest/gtest.h> #include <vector> +#include <random> #include "mo_lib_interpolation_vector.hpp" +#include "dim_helper.hpp" -// Dimensions for the test (small, trivial test). 
-// We assume Fortran ordering: column-major, but our C wrappers will wrap raw -// pointers into Kokkos::Views with LayoutLeft. -constexpr int nproma = 2; -constexpr int nlev = 3; -constexpr int nblks_e = 2; // For the edge arrays (p_vn_in, p_vt_in) -constexpr int nblks_c = 2; // For the cell arrays and interpolation coefficients - -// For the get_indices_c_lib inputs. -constexpr int i_startblk = 0; -constexpr int i_endblk = 1; // two blocks: indices 0 and 1 -constexpr int i_startidx_in = 0; -constexpr int i_endidx_in = nproma - 1; // 0 and 1 -constexpr int slev = 0; -constexpr int elev = nlev - 1; // 0 .. 2 - -// Helper to compute total number of elements for a 3D array stored in -// column-major order. -template <typename T> size_t num_elements(int dim1, int dim2, int dim3) { - return static_cast<size_t>(dim1) * dim2 * dim3; -} +/// Base test class for the edges2cells tests. Templated for the ValueType. +template <typename ValueType> +class InterpolationVectorTest : public ::testing::Test { +protected: + // Constant dimensions + static constexpr int nproma = 2; // inner loop length + static constexpr int nlev = 3; // number of vertical levels + static constexpr int nblks_e = 2; // number of edge blocks + static constexpr int nblks_c = 2; // number of cell blocks + static constexpr int num_edges = 3; // number of edges per cell + + // Parameter values + int i_startblk = 0; + int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1] + int i_startidx_in = 0; + int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1 + int slev = 0; + int elev = nlev - 1; // Full vertical range (0 .. 
nlev-1) + + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Kokkos Views for test data + Kokkos::View<ValueType*, memory_space> p_vn_in; + Kokkos::View<ValueType*, memory_space> p_vt_in; + Kokkos::View<int*, memory_space> cell_edge_idx; + Kokkos::View<int*, memory_space> cell_edge_blk; + Kokkos::View<ValueType*, memory_space> e_bln_c_u; + Kokkos::View<ValueType*, memory_space> e_bln_c_v; + Kokkos::View<ValueType*, memory_space> p_u_out; + Kokkos::View<ValueType*, memory_space> p_v_out; + + InterpolationVectorTest() + : p_vn_in("p_vn_in", dim_combine(nproma, nlev, nblks_e)), + p_vt_in("p_vt_in", dim_combine(nproma, nlev, nblks_e)), + cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, num_edges)), + cell_edge_blk("cell_edge_blk", dim_combine(nproma, nblks_c, num_edges)), + e_bln_c_u("e_bln_c_u", dim_combine(nproma, 6, nblks_c)), + e_bln_c_v("e_bln_c_v", dim_combine(nproma, 6, nblks_c)), + p_u_out("p_u_out", dim_combine(nproma, nlev, nblks_c)), + p_v_out("p_v_out", dim_combine(nproma, nlev, nblks_c)) + {} +}; + +/// ValueTypes to test with +typedef ::testing::Types<float, double> ValueTypes; + +TYPED_TEST_SUITE(InterpolationVectorTest, ValueTypes); -// Test for the double precision (dp) version. -TEST(Edges2CellsTest, DPTest) { - // Allocate and fill input arrays. - std::vector<double> p_vn_in(num_elements<double>(nproma, nlev, nblks_e), 1.0); - std::vector<double> p_vt_in(num_elements<double>(nproma, nlev, nblks_e), 1.0); - // cell_edge_idx and cell_edge_blk: dimensions [nproma, nblks_c, 3] - std::vector<int> cell_edge_idx(num_elements<int>(nproma, nblks_c, 3), 1); - std::vector<int> cell_edge_blk(num_elements<int>(nproma, nblks_c, 3), 1); - - // Here we set cell_edge_idx to 1, 2, 1 for every triple. 
- for (int i = 0; i < num_elements<int>(nproma, nblks_c, 3); i += 3) { - cell_edge_idx[i] = 1; - cell_edge_idx[i + 1] = 2; - cell_edge_idx[i + 2] = 1; +TYPED_TEST(InterpolationVectorTest, Edges2CellsSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_c = this->nblks_c; + constexpr int num_edges = this->num_edges; + + // Define indexing helpers + const auto &vn_at = at<nproma, nlev, nblks_e>; + const auto &vt_at = at<nproma, nlev, nblks_e>; + const auto &edge_idx_at = at<nproma, nblks_c, num_edges>; + const auto &edge_blk_at = at<nproma, nblks_c, num_edges>; + const auto &bln_at = at<nproma, 6, nblks_c>; + const auto &out_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto p_vt_in_h = Kokkos::create_mirror_view(this->p_vt_in); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto e_bln_c_u_h = Kokkos::create_mirror_view(this->e_bln_c_u); + auto e_bln_c_v_h = Kokkos::create_mirror_view(this->e_bln_c_v); + auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out); + auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out); + + // Initialize with simple values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik + ib); + p_vt_in_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(2.0 + ic + ik + ib); + } + } } - // Similarly, set cell_edge_blk to all ones (valid since nblks_e=2, so index 1 - // means block 0 after subtracting 1). 
e_bln_c_u and e_bln_c_v: dimensions - // [nproma, 6, nblks_c] - std::vector<double> e_bln_c_u(num_elements<double>(nproma, 6, nblks_c), 1.0); - std::vector<double> e_bln_c_v(num_elements<double>(nproma, 6, nblks_c), 1.0); - // Output arrays: dimensions [nproma, nlev, nblks_c] - std::vector<double> p_u_out(num_elements<double>(nproma, nlev, nblks_c), 0.0); - std::vector<double> p_v_out(num_elements<double>(nproma, nlev, nblks_c), 0.0); - - std::vector<double> p_u_ref(num_elements<double>(nproma, nlev, nblks_c), 6.0); - std::vector<double> p_v_ref(num_elements<double>(nproma, nlev, nblks_c), 6.0); - - // Call the dp (double precision) version. - edges2cells_vector_lib<double>( - p_vn_in.data(), p_vt_in.data(), cell_edge_idx.data(), - cell_edge_blk.data(), e_bln_c_u.data(), e_bln_c_v.data(), p_u_out.data(), - p_v_out.data(), i_startblk, i_endblk, i_startidx_in, i_endidx_in, slev, - elev, nproma, nlev, nblks_e, nblks_c); - - // Check that for each computed cell in p_u_out and p_v_out, the value is 6. - // This is because for each cell, the kernel adds 6 terms of 1*1. 
- for (size_t idx = 0; idx < p_u_out.size(); ++idx) { - EXPECT_NEAR(p_u_out[idx], p_u_ref[idx], 1e-12); - EXPECT_NEAR(p_v_out[idx], p_v_ref[idx], 1e-12); + + // Set each cell to connect to 3 edges + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Edge indices are 1-indexed in the function + cell_edge_idx_h[edge_idx_at(ic, ib, 0)] = 1; + cell_edge_idx_h[edge_idx_at(ic, ib, 1)] = 2; + cell_edge_idx_h[edge_idx_at(ic, ib, 2)] = 3; + + // Edge blocks are 1-indexed in the function + cell_edge_blk_h[edge_blk_at(ic, ib, 0)] = 1; + cell_edge_blk_h[edge_blk_at(ic, ib, 1)] = 1; + cell_edge_blk_h[edge_blk_at(ic, ib, 2)] = 1; + + // Initialize bilinear coefficients + for (int j = 0; j < 6; ++j) { + e_bln_c_u_h[bln_at(ic, j, ib)] = static_cast<TypeParam>(0.1 * (j + 1)); + e_bln_c_v_h[bln_at(ic, j, ib)] = static_cast<TypeParam>(0.05 * (j + 1)); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_u_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + p_v_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->p_vt_in, p_vt_in_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->e_bln_c_u, e_bln_c_u_h); + Kokkos::deep_copy(this->e_bln_c_v, e_bln_c_v_h); + Kokkos::deep_copy(this->p_u_out, p_u_out_h); + Kokkos::deep_copy(this->p_v_out, p_v_out_h); + + // Call the function + edges2cells_vector_lib<TypeParam>( + this->p_vn_in.data(), this->p_vt_in.data(), + this->cell_edge_idx.data(), this->cell_edge_blk.data(), + this->e_bln_c_u.data(), this->e_bln_c_v.data(), + this->p_u_out.data(), this->p_v_out.data(), + this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, + this->slev, this->elev, nproma, nlev, nblks_e, nblks_c); + + // Copy results back to host + Kokkos::deep_copy(p_u_out_h, this->p_u_out); + 
Kokkos::deep_copy(p_v_out_h, this->p_v_out); + + // Compute expected results on host + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam**, host_space> expected_u("expected_u", nproma, nlev); + Kokkos::View<TypeParam**, host_space> expected_v("expected_v", nproma, nlev); + + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + // Compute expected values + expected_u(jc, jk) = + e_bln_c_u_h[bln_at(jc, 0, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_u_h[bln_at(jc, 1, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_u_h[bln_at(jc, 2, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_u_h[bln_at(jc, 3, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_u_h[bln_at(jc, 4, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] + + e_bln_c_u_h[bln_at(jc, 5, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)]; + + expected_v(jc, jk) = + e_bln_c_v_h[bln_at(jc, 0, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_v_h[bln_at(jc, 1, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_v_h[bln_at(jc, 2, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, 
jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_v_h[bln_at(jc, 3, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_v_h[bln_at(jc, 4, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] + + e_bln_c_v_h[bln_at(jc, 5, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)]; + } + } + } + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx_in; jc <= this->i_endidx_in; ++jc) { + EXPECT_NEAR(p_u_out_h[out_at(jc, jk, jb)], expected_u(jc, jk), 1e-5) + << "u value mismatch at jc=" << jc << ", jk=" << jk; + EXPECT_NEAR(p_v_out_h[out_at(jc, jk, jb)], expected_v(jc, jk), 1e-5) + << "v value mismatch at jc=" << jc << ", jk=" << jk; + } + } } } -// Test for the single precision (sp) version. -TEST(Edges2CellsTest, SPTest) { - // Allocate and fill input arrays. - std::vector<float> p_vn_in(num_elements<float>(nproma, nlev, nblks_e), 1.0f); - std::vector<float> p_vt_in(num_elements<float>(nproma, nlev, nblks_e), 1.0f); - std::vector<int> cell_edge_idx(num_elements<int>(nproma, nblks_c, 3), 1); - std::vector<int> cell_edge_blk(num_elements<int>(nproma, nblks_c, 3), 1); - // Set cell_edge_idx values to 1, 2, 1. 
- for (int i = 0; i < num_elements<int>(nproma, nblks_c, 3); i += 3) { - cell_edge_idx[i] = 1; - cell_edge_idx[i + 1] = 2; - cell_edge_idx[i + 2] = 1; +TYPED_TEST(InterpolationVectorTest, Edges2CellsRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_c = this->nblks_c; + constexpr int num_edges = this->num_edges; + + // Define indexing helpers + const auto &vn_at = at<nproma, nlev, nblks_e>; + const auto &vt_at = at<nproma, nlev, nblks_e>; + const auto &edge_idx_at = at<nproma, nblks_c, num_edges>; + const auto &edge_blk_at = at<nproma, nblks_c, num_edges>; + const auto &bln_at = at<nproma, 6, nblks_c>; + const auto &out_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto p_vt_in_h = Kokkos::create_mirror_view(this->p_vt_in); + auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx); + auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk); + auto e_bln_c_u_h = Kokkos::create_mirror_view(this->e_bln_c_u); + auto e_bln_c_v_h = Kokkos::create_mirror_view(this->e_bln_c_v); + auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out); + auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> edge_distrib(1, num_edges); + std::uniform_int_distribution<int> block_distrib(1, nblks_e); + std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0); + + // Initialize with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vn_in_h[vn_at(ic, ik, ib)] = real_distrib(gen); + p_vt_in_h[vt_at(ic, ik, ib)] = real_distrib(gen); + } + } + } + + // // Set each cell to connect to random edges + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Edge 
indices are 1-indexed in the function + cell_edge_idx_h[edge_idx_at(ic, ib, 0)] = edge_distrib(gen); + cell_edge_idx_h[edge_idx_at(ic, ib, 1)] = edge_distrib(gen); + cell_edge_idx_h[edge_idx_at(ic, ib, 2)] = edge_distrib(gen); + + // Edge blocks are 1-indexed in the function + cell_edge_blk_h[edge_blk_at(ic, ib, 0)] = block_distrib(gen); + cell_edge_blk_h[edge_blk_at(ic, ib, 1)] = block_distrib(gen); + cell_edge_blk_h[edge_blk_at(ic, ib, 2)] = block_distrib(gen); + + // Initialize random bilinear coefficients + for (int j = 0; j < 6; ++j) { + e_bln_c_u_h[bln_at(ic, j, ib)] = real_distrib(gen); + e_bln_c_v_h[bln_at(ic, j, ib)] = real_distrib(gen); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_u_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + p_v_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->p_vt_in, p_vt_in_h); + Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h); + Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h); + Kokkos::deep_copy(this->e_bln_c_u, e_bln_c_u_h); + Kokkos::deep_copy(this->e_bln_c_v, e_bln_c_v_h); + Kokkos::deep_copy(this->p_u_out, p_u_out_h); + Kokkos::deep_copy(this->p_v_out, p_v_out_h); + + // Call the function + edges2cells_vector_lib<TypeParam>( + this->p_vn_in.data(), this->p_vt_in.data(), + this->cell_edge_idx.data(), this->cell_edge_blk.data(), + this->e_bln_c_u.data(), this->e_bln_c_v.data(), + this->p_u_out.data(), this->p_v_out.data(), + this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, + this->slev, this->elev, nproma, nlev, nblks_e, nblks_c); + + // Copy results back to host + Kokkos::deep_copy(p_u_out_h, this->p_u_out); + Kokkos::deep_copy(p_v_out_h, this->p_v_out); + + // Compute expected results on host + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_u("expected_u", nproma, nlev, nblks_c); + 
Kokkos::View<TypeParam***, host_space> expected_v("expected_v", nproma, nlev, nblks_c); + + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + // Compute expected values + expected_u(jc, jk, jb) = + e_bln_c_u_h[bln_at(jc, 0, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_u_h[bln_at(jc, 1, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_u_h[bln_at(jc, 2, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_u_h[bln_at(jc, 3, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_u_h[bln_at(jc, 4, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] + + e_bln_c_u_h[bln_at(jc, 5, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)]; + + expected_v(jc, jk, jb) = + e_bln_c_v_h[bln_at(jc, 0, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_v_h[bln_at(jc, 1, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] + + e_bln_c_v_h[bln_at(jc, 2, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_v_h[bln_at(jc, 3, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk, + 
cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] + + e_bln_c_v_h[bln_at(jc, 4, jb)] * + p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] + + e_bln_c_v_h[bln_at(jc, 5, jb)] * + p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk, + cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)]; + } + } } - std::vector<float> e_bln_c_u(num_elements<float>(nproma, 6, nblks_c), 1.0f); - std::vector<float> e_bln_c_v(num_elements<float>(nproma, 6, nblks_c), 1.0f); - std::vector<float> p_u_out(num_elements<float>(nproma, nlev, nblks_c), 0.0f); - std::vector<float> p_v_out(num_elements<float>(nproma, nlev, nblks_c), 0.0f); - - std::vector<float> p_u_ref(num_elements<float>(nproma, nlev, nblks_c), 6.0f); - std::vector<float> p_v_ref(num_elements<float>(nproma, nlev, nblks_c), 6.0f); - - // Call the sp (float precision) version. - edges2cells_vector_lib<float>( - p_vn_in.data(), p_vt_in.data(), cell_edge_idx.data(), - cell_edge_blk.data(), e_bln_c_u.data(), e_bln_c_v.data(), p_u_out.data(), - p_v_out.data(), i_startblk, i_endblk, i_startidx_in, i_endidx_in, slev, - elev, nproma, nlev, nblks_e, nblks_c); - - // Verify that every computed output equals 6. - for (size_t idx = 0; idx < p_u_out.size(); ++idx) { - EXPECT_NEAR(p_u_out[idx], p_u_ref[idx], 1e-5f); - EXPECT_NEAR(p_v_out[idx], p_v_ref[idx], 1e-5f); + + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = this->i_startidx_in; jc <= this->i_endidx_in; ++jc) { + EXPECT_NEAR(p_u_out_h[out_at(jc, jk, 0)], expected_u(jc, jk, 0), tol) + << "u value mismatch at jc=" << jc << ", jk=" << jk; + EXPECT_NEAR(p_v_out_h[out_at(jc, jk, 0)], expected_v(jc, jk, 0), tol) + << "v value mismatch at jc=" << jc << ", jk=" << jk; + } + } } } diff --git a/test/c/test_intp_rbf.cpp b/test/c/test_intp_rbf.cpp index 040d440223c683407a585de764511e5e2b384aea..a67480de0746d9564a5f3baf323e78fa1314ccfd 100644 --- a/test/c/test_intp_rbf.cpp +++ b/test/c/test_intp_rbf.cpp @@ -15,15 +15,9 @@ #include <gtest/gtest.h> #include <numeric> #include <vector> - -// Free-function helpers for 3D and 4D array sizes (assumed column-major) -template <typename T> size_t num_elements_3d(int d1, int d2, int d3) { - return static_cast<size_t>(d1) * d2 * d3; -} - -template <typename T> size_t num_elements_4d(int d1, int d2, int d3, int d4) { - return static_cast<size_t>(d1) * d2 * d3 * d4; -} +#include <random> +#include <iostream> +#include "dim_helper.hpp" // Define a helper struct that holds the two types. template <typename InT, typename OutT> struct MixedPrecision { @@ -31,6 +25,9 @@ template <typename InT, typename OutT> struct MixedPrecision { using out_type = OutT; }; +// Define the list of types we want to test. +typedef ::testing::Types<float, double> MyTypes; + // Define the list of type pairs we want to test. typedef ::testing::Types<MixedPrecision<double, double>, MixedPrecision<double, float>, @@ -48,6 +45,7 @@ public: static constexpr int rbf_c2grad_dim = 10; // fixed dimension static constexpr int rbf_vec_dim_c = 9; static constexpr int rbf_vec_dim_e = 4; + static constexpr int rbf_vec_dim_v = 6; // Fixed dimension for RBF // Parameter values. 
const int i_startblk = 0; @@ -60,244 +58,1103 @@ public: const bool acc_async = false; // No asynchronous execution. }; -// Define a typed test fixture for the functions which have the same input and -// output types template <typename T> -class RbfInterpolTypedTestFixture : public ::testing::Test, +class RbfInterpolSingleParamTest : public ::testing::Test, public interp_dimensions { public: - // Data arrays. - std::vector<T> p_cell_in; // size: nproma * nlev * nblks_c - std::vector<int> rbf_c2grad_idx; // size: rbf_c2grad_dim * nproma * nblks_c - std::vector<int> rbf_c2grad_blk; // size: rbf_c2grad_dim * nproma * nblks_c - std::vector<int> rbf_vec_idx_c; // size: rbf_vec_dim_c * nproma * nblks_c - std::vector<int> rbf_vec_blk_c; // size: rbf_vec_dim_c * nproma * nblks_c - std::vector<T> - rbf_c2grad_coeff; // size: rbf_c2grad_dim * 2 * nproma * nblks_c - std::vector<T> grad_x; // size: nproma * nlev * nblks_c - std::vector<T> grad_y; // size: nproma * nlev * nblks_c - std::vector<T> p_vn_in; - std::vector<T> rbf_vec_coeff_c; - std::vector<T> p_u_out; - std::vector<T> p_v_out; - - std::vector<int> rbf_vec_idx_e; - std::vector<int> rbf_vec_blk_e; - std::vector<T> rbf_vec_coeff_e; - std::vector<T> p_vt_out; - - RbfInterpolTypedTestFixture() { - size_t size3d = static_cast<size_t>(nproma) * nlev * nblks_c; - size_t size3d_idx = static_cast<size_t>(rbf_c2grad_dim) * nproma * nblks_c; - size_t size4d = static_cast<size_t>(rbf_c2grad_dim) * 2 * nproma * nblks_c; - - size_t size3d_vec_dim = - static_cast<size_t>(rbf_vec_dim_c) * nproma * nblks_c; - size_t size_4d_vec_dim = - static_cast<size_t>(rbf_vec_dim_c) * 2 * nproma * nblks_c; - - size_t size3d_edge_lib = - static_cast<size_t>(rbf_vec_dim_e) * nproma * nblks_c; - size_t size_4d_edge_lib = - static_cast<size_t>(rbf_vec_dim_e) * 2 * nproma * nblks_c; - - p_cell_in.resize(size3d, static_cast<T>(1)); - p_vn_in.resize(size3d, static_cast<T>(1)); - - rbf_vec_idx_c.resize(size3d_vec_dim, 1); - 
rbf_vec_blk_c.resize(size3d_vec_dim, 0); - rbf_c2grad_idx.resize(size3d_idx, 1); - rbf_c2grad_blk.resize(size3d_idx, 0); // Set block indices to 0 for testing. - rbf_vec_idx_e.resize(size3d_vec_dim, 1); - rbf_vec_blk_e.resize(size3d_vec_dim, 0); - - rbf_vec_coeff_c.resize(size_4d_vec_dim, static_cast<T>(1)); - rbf_c2grad_coeff.resize(size4d, static_cast<T>(1)); - rbf_vec_coeff_e.resize(size_4d_edge_lib, static_cast<T>(1)); - - p_u_out.resize(size3d_vec_dim, static_cast<T>(0)); - p_v_out.resize(size3d_vec_dim, static_cast<T>(0)); - p_vt_out.resize(size3d_edge_lib, static_cast<T>(0)); - - grad_x.resize(size3d, static_cast<T>(0)); - grad_y.resize(size3d, static_cast<T>(0)); - } + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Kokkos Views for test data + Kokkos::View<T*, memory_space> p_cell_in; // Dimensions: (nproma, nlev, nblks_c) + Kokkos::View<int*, memory_space> rbf_c2grad_idx; // Dimensions: (rbf_c2grad_dim, nproma, nblks_c) + Kokkos::View<int*, memory_space> rbf_c2grad_blk; // Dimensions: (rbf_c2grad_dim, nproma, nblks_c) + Kokkos::View<T*, memory_space> rbf_c2grad_coeff; // Dimensions: (rbf_c2grad_dim, 2, nproma, nblks_c) + Kokkos::View<T*, memory_space> grad_x; // Dimensions: (nproma, nlev, nblks_c) + Kokkos::View<T*, memory_space> grad_y; // Dimensions: (nproma, nlev, nblks_c) + + // Additional arrays for other functions + Kokkos::View<T*, memory_space> p_vn_in; + Kokkos::View<int*, memory_space> rbf_vec_idx_c; + Kokkos::View<int*, memory_space> rbf_vec_blk_c; + Kokkos::View<T*, memory_space> rbf_vec_coeff_c; + Kokkos::View<T*, memory_space> p_u_out; + Kokkos::View<T*, memory_space> p_v_out; + + Kokkos::View<int*, memory_space> rbf_vec_idx_e; + Kokkos::View<int*, memory_space> rbf_vec_blk_e; + Kokkos::View<T*, memory_space> rbf_vec_coeff_e; + Kokkos::View<T*, memory_space> p_vt_out; + + RbfInterpolSingleParamTest() + : p_cell_in("p_cell_in", nproma * 
nlev * nblks_c), + rbf_c2grad_idx("rbf_c2grad_idx", rbf_c2grad_dim * nproma * nblks_c), + rbf_c2grad_blk("rbf_c2grad_blk", rbf_c2grad_dim * nproma * nblks_c), + rbf_c2grad_coeff("rbf_c2grad_coeff", rbf_c2grad_dim * 2 * nproma * nblks_c), + grad_x("grad_x", nproma * nlev * nblks_c), + grad_y("grad_y", nproma * nlev * nblks_c), + + p_vn_in("p_vn_in", nproma * nlev * nblks_c), + rbf_vec_idx_c("rbf_vec_idx_c", rbf_vec_dim_c * nproma * nblks_c), + rbf_vec_blk_c("rbf_vec_blk_c", rbf_vec_dim_c * nproma * nblks_c), + rbf_vec_coeff_c("rbf_vec_coeff_c", rbf_vec_dim_c * 2 * nproma * nblks_c), + p_u_out("p_u_out", nproma * nlev * nblks_c), + p_v_out("p_v_out", nproma * nlev * nblks_c), + + rbf_vec_idx_e("rbf_vec_idx_e", rbf_vec_dim_e * nproma * nblks_c), + rbf_vec_blk_e("rbf_vec_blk_e", rbf_vec_dim_e * nproma * nblks_c), + rbf_vec_coeff_e("rbf_vec_coeff_e", rbf_vec_dim_e * 2 * nproma * nblks_c), + p_vt_out("p_vt_out", nproma * nlev * nblks_c) + {} }; -typedef ::testing::Types<float, double> MyTypes; +TYPED_TEST_SUITE(RbfInterpolSingleParamTest, MyTypes); + +//////////////////////////////////////////////////////////////////////////////// +// +// ! 
rbf_interpol_c2grad +// +//////////////////////////////////////////////////////////////////////////////// + +TYPED_TEST(RbfInterpolSingleParamTest, C2GradSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int rbf_c2grad_dim = this->rbf_c2grad_dim; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<rbf_c2grad_dim, nproma, nblks_c>; + const auto &blk_at = at<rbf_c2grad_dim, nproma, nblks_c>; + const auto &coeff_at = at<rbf_c2grad_dim, 2, nproma, nblks_c>; + const auto &grad_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto rbf_c2grad_idx_h = Kokkos::create_mirror_view(this->rbf_c2grad_idx); + auto rbf_c2grad_blk_h = Kokkos::create_mirror_view(this->rbf_c2grad_blk); + auto rbf_c2grad_coeff_h = Kokkos::create_mirror_view(this->rbf_c2grad_coeff); + auto grad_x_h = Kokkos::create_mirror_view(this->grad_x); + auto grad_y_h = Kokkos::create_mirror_view(this->grad_y); -TYPED_TEST_SUITE(RbfInterpolTypedTestFixture, MyTypes); + // Initialize with index-based pattern for cell data + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize connectivity indices with specific pattern + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // First index points to self + rbf_c2grad_idx_h[idx_at(0, ic, ib)] = ic; + rbf_c2grad_blk_h[blk_at(0, ic, ib)] = ib; + + // Other indices follow a pattern + for (int j = 1; j < rbf_c2grad_dim; ++j) { + rbf_c2grad_idx_h[idx_at(j, ic, ib)] = (ic + j) % nproma; + rbf_c2grad_blk_h[blk_at(j, ic, ib)] = (ib + j % nblks_c) % nblks_c; + } + + // Coefficients for x and y 
gradients - use a simple pattern that depends on ib, ic and j + for (int j = 0; j < rbf_c2grad_dim; ++j) { + rbf_c2grad_coeff_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient + rbf_c2grad_coeff_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient + } + } + } -TYPED_TEST(RbfInterpolTypedTestFixture, C2Grad) { - using T = TypeParam; + // Initialize gradients to zero + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + grad_x_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + grad_y_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->rbf_c2grad_idx, rbf_c2grad_idx_h); + Kokkos::deep_copy(this->rbf_c2grad_blk, rbf_c2grad_blk_h); + Kokkos::deep_copy(this->rbf_c2grad_coeff, rbf_c2grad_coeff_h); + Kokkos::deep_copy(this->grad_x, grad_x_h); + Kokkos::deep_copy(this->grad_y, grad_y_h); + + Kokkos::fence(); + + // Call the function rbf_interpol_c2grad_lib<TypeParam>( this->p_cell_in.data(), this->rbf_c2grad_idx.data(), this->rbf_c2grad_blk.data(), this->rbf_c2grad_coeff.data(), this->grad_x.data(), this->grad_y.data(), this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, - this->elev, this->nproma, this->rbf_c2grad_dim, this->nlev, this->nblks_c, - this->lacc); - - // For each block from i_startblk to i_endblk-1, and for each (i, level) - // the kernel sums rbf_c2grad_dim contributions, each equal to 1. - // Therefore, we expect grad_x and grad_y to equal rbf_c2grad_dim (i.e., 10). 
- for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { - for (int jk = 0; jk < this->nlev; ++jk) { - for (int i = 0; i < this->nproma; ++i) { - size_t idx = i + static_cast<size_t>(jk) * this->nproma + - static_cast<size_t>(jb) * this->nproma * this->nlev; - EXPECT_NEAR(this->grad_x[idx], - static_cast<TypeParam>(this->rbf_c2grad_dim), - static_cast<TypeParam>(1e-5)) - << "grad_x failure at block " << jb << ", level " << jk - << ", index " << i; - EXPECT_NEAR(this->grad_y[idx], - static_cast<TypeParam>(this->rbf_c2grad_dim), - static_cast<TypeParam>(1e-5)) - << "grad_y failure at block " << jb << ", level " << jk - << ", index " << i; + this->elev, nproma, rbf_c2grad_dim, nlev, nblks_c, this->lacc); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(grad_x_h, this->grad_x); + Kokkos::deep_copy(grad_y_h, this->grad_y); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_grad_x = { + 19.9225, 22.9275, 26.2225, 20.9675, 24.0725, 27.4675, + 22.0125, 25.2175, 28.7125, 23.0575, 26.3625, 29.9575, + 38.972, 42.977, 47.272, 41.017, 45.122, 49.517, + 43.062, 47.267, 51.762, 45.107, 49.412, 54.007 + }; + + std::vector<TypeParam> expected_grad_y = { + 38.9725, 42.9775, 47.2725, 41.0175, 45.1225, 49.5175, + 43.0625, 47.2675, 51.7625, 45.1075, 49.4125, 54.0075, + 58.022, 63.027, 68.322, 61.067, 66.172, 71.567, + 64.112, 69.317, 74.812, 67.157, 72.462, 78.057 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + EXPECT_NEAR(grad_x_h[grad_at(jc, jk, jb)], + expected_grad_x[grad_at(jc, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "grad_x failure at block " << jb << ", level " << jk << ", index " << jc; + 
EXPECT_NEAR(grad_y_h[grad_at(jc, jk, jb)], + expected_grad_y[grad_at(jc, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "grad_y failure at block " << jb << ", level " << jk << ", index " << jc; } } } } -TYPED_TEST(RbfInterpolTypedTestFixture, Cell) { - using T = TypeParam; +TYPED_TEST(RbfInterpolSingleParamTest, C2GradRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int rbf_c2grad_dim = this->rbf_c2grad_dim; + + // Define indexing helpers + const auto &cell_at = at<nproma, nlev, nblks_c>; + const auto &idx_at = at<rbf_c2grad_dim, nproma, nblks_c>; + const auto &blk_at = at<rbf_c2grad_dim, nproma, nblks_c>; + const auto &coeff_at = at<rbf_c2grad_dim, 2, nproma, nblks_c>; + const auto &grad_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in); + auto rbf_c2grad_idx_h = Kokkos::create_mirror_view(this->rbf_c2grad_idx); + auto rbf_c2grad_blk_h = Kokkos::create_mirror_view(this->rbf_c2grad_blk); + auto rbf_c2grad_coeff_h = Kokkos::create_mirror_view(this->rbf_c2grad_coeff); + auto grad_x_h = Kokkos::create_mirror_view(this->grad_x); + auto grad_y_h = Kokkos::create_mirror_view(this->grad_y); + + // Use fixed seed for reproducibility + std::mt19937 gen(42); + std::uniform_int_distribution<int> cell_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + std::uniform_real_distribution<double> coeff_distrib(-0.2, 0.2); // Allow negative coefficients for gradients + + // Initialize with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen)); + } + } + } + + // Initialize connectivity indices with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for 
(int ic = 0; ic < nproma; ++ic) { + // First index points to self + rbf_c2grad_idx_h[idx_at(0, ic, ib)] = ic; + rbf_c2grad_blk_h[blk_at(0, ic, ib)] = ib; + + // Other indices randomized + for (int j = 1; j < rbf_c2grad_dim; ++j) { + rbf_c2grad_idx_h[idx_at(j, ic, ib)] = cell_distrib(gen); + rbf_c2grad_blk_h[blk_at(j, ic, ib)] = block_distrib(gen); + } + + // Random coefficients for gradient reconstruction + for (int j = 0; j < rbf_c2grad_dim; ++j) { + rbf_c2grad_coeff_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); // x coefficient + rbf_c2grad_coeff_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); // y coefficient + } + } + } + + // Initialize gradients to zero + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + grad_x_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + grad_y_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_cell_in, p_cell_in_h); + Kokkos::deep_copy(this->rbf_c2grad_idx, rbf_c2grad_idx_h); + Kokkos::deep_copy(this->rbf_c2grad_blk, rbf_c2grad_blk_h); + Kokkos::deep_copy(this->rbf_c2grad_coeff, rbf_c2grad_coeff_h); + Kokkos::deep_copy(this->grad_x, grad_x_h); + Kokkos::deep_copy(this->grad_y, grad_y_h); + + Kokkos::fence(); + + // Call the function + rbf_interpol_c2grad_lib<TypeParam>( + this->p_cell_in.data(), this->rbf_c2grad_idx.data(), + this->rbf_c2grad_blk.data(), this->rbf_c2grad_coeff.data(), + this->grad_x.data(), this->grad_y.data(), this->i_startblk, + this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, + this->elev, nproma, rbf_c2grad_dim, nlev, nblks_c, this->lacc); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(grad_x_h, this->grad_x); + Kokkos::deep_copy(grad_y_h, this->grad_y); - rbf_vec_interpol_cell_lib<T>( + // Calculate expected values + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, 
host_space> expected_grad_x("expected_grad_x", nproma, nlev, nblks_c); + Kokkos::View<TypeParam***, host_space> expected_grad_y("expected_grad_y", nproma, nlev, nblks_c); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + expected_grad_x(jc, jk, jb) = static_cast<TypeParam>(0.0); + expected_grad_y(jc, jk, jb) = static_cast<TypeParam>(0.0); + + for (int j = 0; j < rbf_c2grad_dim; ++j) { + int cell_idx = rbf_c2grad_idx_h[idx_at(j, jc, jb)]; + int cell_blk = rbf_c2grad_blk_h[blk_at(j, jc, jb)]; + TypeParam coeff_x = rbf_c2grad_coeff_h[coeff_at(j, 0, jc, jb)]; + TypeParam coeff_y = rbf_c2grad_coeff_h[coeff_at(j, 1, jc, jb)]; + + expected_grad_x(jc, jk, jb) += + coeff_x * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]; + expected_grad_y(jc, jk, jb) += + coeff_y * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? 
+ static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + EXPECT_NEAR(grad_x_h[grad_at(jc, jk, jb)], + expected_grad_x(jc, jk, jb), tol) + << "grad_x failure at block " << jb << ", level " << jk << ", index " << jc; + EXPECT_NEAR(grad_y_h[grad_at(jc, jk, jb)], + expected_grad_y(jc, jk, jb), tol) + << "grad_y failure at block " << jb << ", level " << jk << ", index " << jc; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +// ! rbf_vec_interpol_cell +// +//////////////////////////////////////////////////////////////////////////////// + +TYPED_TEST(RbfInterpolSingleParamTest, CellSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_e = this->nblks_e; + constexpr int rbf_vec_dim_c = this->rbf_vec_dim_c; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<rbf_vec_dim_c, nproma, nblks_c>; + const auto &blk_at = at<rbf_vec_dim_c, nproma, nblks_c>; + const auto &coeff_at = at<rbf_vec_dim_c, 2, nproma, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto rbf_vec_idx_c_h = Kokkos::create_mirror_view(this->rbf_vec_idx_c); + auto rbf_vec_blk_c_h = Kokkos::create_mirror_view(this->rbf_vec_blk_c); + auto rbf_vec_coeff_c_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_c); + auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out); + auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out); + + // Initialize 
with index-based pattern for edge data + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_vn_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize cell connectivity indices with specific pattern + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each cell connects to rbf_vec_dim_c edges + for (int j = 0; j < rbf_vec_dim_c; ++j) { + // Edge indices with a pattern + rbf_vec_idx_c_h[idx_at(j, ic, ib)] = (ic + j) % nproma; + rbf_vec_blk_c_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e; + + // Interpolation coefficients that depend on indices + rbf_vec_coeff_c_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient + rbf_vec_coeff_c_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_u_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + p_v_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->rbf_vec_idx_c, rbf_vec_idx_c_h); + Kokkos::deep_copy(this->rbf_vec_blk_c, rbf_vec_blk_c_h); + Kokkos::deep_copy(this->rbf_vec_coeff_c, rbf_vec_coeff_c_h); + Kokkos::deep_copy(this->p_u_out, p_u_out_h); + Kokkos::deep_copy(this->p_v_out, p_v_out_h); + + Kokkos::fence(); + + // Call the function + rbf_vec_interpol_cell_lib<TypeParam>( this->p_vn_in.data(), this->rbf_vec_idx_c.data(), this->rbf_vec_blk_c.data(), this->rbf_vec_coeff_c.data(), this->p_u_out.data(), this->p_v_out.data(), this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, - this->elev, this->nproma, this->rbf_c2grad_dim, this->nlev, this->nblks_c, - this->nblks_e, this->lacc, this->acc_async); - 
- for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { - for (int jk = 0; jk < this->nlev; ++jk) { - for (int i = 0; i < this->nproma; ++i) { - size_t idx = i + static_cast<size_t>(jk) * this->nproma + - static_cast<size_t>(jb) * this->nproma * this->nlev; - EXPECT_NEAR(this->p_u_out[idx], static_cast<T>(this->rbf_vec_dim_c), - static_cast<T>(1e-5)) - << "p_u_out failure at block " << jb << ", level " << jk - << ", index " << i; + this->elev, nproma, nlev, nblks_c, nblks_e, rbf_vec_dim_c, + this->lacc, this->acc_async); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(p_u_out_h, this->p_u_out); + Kokkos::deep_copy(p_v_out_h, this->p_v_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_u = { + 18.8216, 20.5356, 22.3396, 19.7576, 21.5616, 23.4556, + 20.6936, 22.5876, 24.5716, 21.6296, 23.6136, 25.6876, + 36.882, 38.597, 40.402, 38.718, 40.523, 42.418, + 40.554, 42.449, 44.434, 42.39, 44.375, 46.45 + }; + std::vector<TypeParam> expected_v = { + 36.8616, 38.5756, 40.3796, 38.6976, 40.5016, 42.3956, + 40.5336, 42.4276, 44.4116, 42.3696, 44.3536, 46.4276, + 54.932, 56.647, 58.452, 57.668, 59.473, 61.368, + 60.404, 62.299, 64.284, 63.14, 65.125, 67.2 + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + EXPECT_NEAR(p_u_out_h[cell_at(jc, jk, jb)], + expected_u[cell_at(jc, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "u failure at block " << jb << ", level " << jk << ", index " << jc; + + EXPECT_NEAR(p_v_out_h[cell_at(jc, jk, jb)], + expected_v[cell_at(jc, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "v failure at block " << jb << ", level " << jk << ", index " << jc; } } } } 
-TYPED_TEST(RbfInterpolTypedTestFixture, Edge) { - using T = TypeParam; +TYPED_TEST(RbfInterpolSingleParamTest, CellRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_c = this->nblks_c; + constexpr int nblks_e = this->nblks_e; + constexpr int rbf_vec_dim_c = this->rbf_vec_dim_c; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<rbf_vec_dim_c, nproma, nblks_c>; + const auto &blk_at = at<rbf_vec_dim_c, nproma, nblks_c>; + const auto &coeff_at = at<rbf_vec_dim_c, 2, nproma, nblks_c>; + const auto &cell_at = at<nproma, nlev, nblks_c>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto rbf_vec_idx_c_h = Kokkos::create_mirror_view(this->rbf_vec_idx_c); + auto rbf_vec_blk_c_h = Kokkos::create_mirror_view(this->rbf_vec_blk_c); + auto rbf_vec_coeff_c_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_c); + auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out); + auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(43); // Different seed from other tests + std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + std::uniform_real_distribution<double> coeff_distrib(0.01, 0.2); + + // Initialize with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vn_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen)); + } + } + } + + // Initialize cell connectivity indices with random values + for (int ib = 0; ib < nblks_c; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + for (int j = 0; j < rbf_vec_dim_c; ++j) { + // Random edge indices and blocks + rbf_vec_idx_c_h[idx_at(j, ic, ib)] = edge_distrib(gen); + 
rbf_vec_blk_c_h[blk_at(j, ic, ib)] = block_distrib(gen); + // Random coefficients for interpolation + rbf_vec_coeff_c_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + rbf_vec_coeff_c_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_u_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + p_v_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->rbf_vec_idx_c, rbf_vec_idx_c_h); + Kokkos::deep_copy(this->rbf_vec_blk_c, rbf_vec_blk_c_h); + Kokkos::deep_copy(this->rbf_vec_coeff_c, rbf_vec_coeff_c_h); + Kokkos::deep_copy(this->p_u_out, p_u_out_h); + Kokkos::deep_copy(this->p_v_out, p_v_out_h); + + Kokkos::fence(); + + // Call the function + rbf_vec_interpol_cell_lib<TypeParam>( + this->p_vn_in.data(), this->rbf_vec_idx_c.data(), + this->rbf_vec_blk_c.data(), this->rbf_vec_coeff_c.data(), + this->p_u_out.data(), this->p_v_out.data(), this->i_startblk, + this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, + this->elev, nproma, nlev, nblks_c, nblks_e, rbf_vec_dim_c, + this->lacc, this->acc_async); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(p_u_out_h, this->p_u_out); + Kokkos::deep_copy(p_v_out_h, this->p_v_out); + + // Calculate expected values + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_u("expected_u", nproma, nlev, nblks_c); + Kokkos::View<TypeParam***, host_space> expected_v("expected_v", nproma, nlev, nblks_c); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = 
i_startidx; jc <= i_endidx; ++jc) { + expected_u(jc, jk, jb) = static_cast<TypeParam>(0.0); + expected_v(jc, jk, jb) = static_cast<TypeParam>(0.0); + + for (int j = 0; j < rbf_vec_dim_c; ++j) { + int edge_idx = rbf_vec_idx_c_h[idx_at(j, jc, jb)]; + int edge_blk = rbf_vec_blk_c_h[blk_at(j, jc, jb)]; + TypeParam coeff_u = rbf_vec_coeff_c_h[coeff_at(j, 0, jc, jb)]; + TypeParam coeff_v = rbf_vec_coeff_c_h[coeff_at(j, 1, jc, jb)]; + + expected_u(jc, jk, jb) += + coeff_u * p_vn_in_h[edge_at(edge_idx, jk, edge_blk)]; + expected_v(jc, jk, jb) += + coeff_v * p_vn_in_h[edge_at(edge_idx, jk, edge_blk)]; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); - rbf_vec_interpol_edge_lib<T>( + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int jc = i_startidx; jc <= i_endidx; ++jc) { + EXPECT_NEAR(p_u_out_h[cell_at(jc, jk, jb)], + expected_u(jc, jk, jb), tol) + << "u failure at block " << jb << ", level " << jk << ", index " << jc; + + EXPECT_NEAR(p_v_out_h[cell_at(jc, jk, jb)], + expected_v(jc, jk, jb), tol) + << "v failure at block " << jb << ", level " << jk << ", index " << jc; + } + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// +// ! 
rbf_vec_interpol_edge +// +//////////////////////////////////////////////////////////////////////////////// + +TYPED_TEST(RbfInterpolSingleParamTest, EdgeSpecific) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int rbf_vec_dim_e = this->rbf_vec_dim_e; + + // Define indexing helpers + const auto &vn_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &blk_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &coeff_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &vt_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto rbf_vec_idx_e_h = Kokkos::create_mirror_view(this->rbf_vec_idx_e); + auto rbf_vec_blk_e_h = Kokkos::create_mirror_view(this->rbf_vec_blk_e); + auto rbf_vec_coeff_e_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_e); + auto p_vt_out_h = Kokkos::create_mirror_view(this->p_vt_out); + + // Initialize with index-based pattern for edge data + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize edge connectivity indices with specific pattern + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each edge uses rbf_vec_dim_e neighboring edges + for (int j = 0; j < rbf_vec_dim_e; ++j) { + // Edge indices with a pattern + rbf_vec_idx_e_h[idx_at(j, ic, ib)] = (ic + j) % nproma; + rbf_vec_blk_e_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e; + + // Interpolation coefficients that depend on indices + rbf_vec_coeff_e_h[coeff_at(j, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // coefficient + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + 
p_vt_out_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->rbf_vec_idx_e, rbf_vec_idx_e_h); + Kokkos::deep_copy(this->rbf_vec_blk_e, rbf_vec_blk_e_h); + Kokkos::deep_copy(this->rbf_vec_coeff_e, rbf_vec_coeff_e_h); + Kokkos::deep_copy(this->p_vt_out, p_vt_out_h); + + Kokkos::fence(); + + // Call the function + rbf_vec_interpol_edge_lib<TypeParam>( this->p_vn_in.data(), this->rbf_vec_idx_e.data(), this->rbf_vec_blk_e.data(), this->rbf_vec_coeff_e.data(), this->p_vt_out.data(), this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, this->elev, - this->nlev, this->nproma, this->rbf_vec_dim_e, this->nblks_e, this->lacc, - this->acc_async); - - for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) { - for (int jk = 0; jk < this->nlev; ++jk) { - for (int i = 0; i < this->nproma; ++i) { - size_t idx = i + static_cast<size_t>(jk) * this->nproma + - static_cast<size_t>(jb) * this->nproma * this->nlev; - EXPECT_NEAR(this->p_vt_out[idx], static_cast<T>(this->rbf_vec_dim_e), - static_cast<T>(1e-5)) - << "p_vt_out failure at block " << jb << ", level " << jk - << ", index " << i; + nlev, nproma, rbf_vec_dim_e, nblks_e, this->lacc, this->acc_async); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(p_vt_out_h, this->p_vt_out); + + // Expected results based on the specific test values + std::vector<TypeParam> expected_vt = { + 7.1304, 8.9324, 10.9644, 7.5364, 9.3784, 11.4504, + 7.9424, 9.8244, 11.9364, 8.3484, 10.2704, 12.4224, + 14.1502, 16.9522, 19.9842, 14.9562, 17.7982, 20.8702, + 15.7622, 18.6442, 21.7562, 16.5682, 19.4902, 22.6422, + }; + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk 
<= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + EXPECT_NEAR(p_vt_out_h[vt_at(je, jk, jb)], + expected_vt[vt_at(je, jk, jb)], + static_cast<TypeParam>(1e-5)) + << "Tangential velocity failure at block " << jb << ", level " << jk << ", index " << je; + } + } + } +} + +TYPED_TEST(RbfInterpolSingleParamTest, EdgeRandom) { + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int rbf_vec_dim_e = this->rbf_vec_dim_e; + + // Define indexing helpers + const auto &vn_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &blk_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &coeff_at = at<rbf_vec_dim_e, nproma, nblks_e>; + const auto &vt_at = at<nproma, nlev, nblks_e>; + + // Create host mirror views + auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in); + auto rbf_vec_idx_e_h = Kokkos::create_mirror_view(this->rbf_vec_idx_e); + auto rbf_vec_blk_e_h = Kokkos::create_mirror_view(this->rbf_vec_blk_e); + auto rbf_vec_coeff_e_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_e); + auto p_vt_out_h = Kokkos::create_mirror_view(this->p_vt_out); + + // Use fixed seed for reproducibility + std::mt19937 gen(44); // Different seed from other tests + std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); + std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1); + std::uniform_real_distribution<double> real_distrib(0.01, 1.0); + std::uniform_real_distribution<double> coeff_distrib(0.01, 0.5); + + // Initialize with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen)); + } + } + } + + // Initialize edge connectivity indices with random values + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + + for (int j = 0; j < 
rbf_vec_dim_e; ++j) { + // Random edge indices and blocks + rbf_vec_idx_e_h[idx_at(j, ic, ib)] = edge_distrib(gen); + rbf_vec_blk_e_h[blk_at(j, ic, ib)] = block_distrib(gen); + // Random coefficients for interpolation + rbf_vec_coeff_e_h[coeff_at(j, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen)); + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_vt_out_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_vn_in, p_vn_in_h); + Kokkos::deep_copy(this->rbf_vec_idx_e, rbf_vec_idx_e_h); + Kokkos::deep_copy(this->rbf_vec_blk_e, rbf_vec_blk_e_h); + Kokkos::deep_copy(this->rbf_vec_coeff_e, rbf_vec_coeff_e_h); + Kokkos::deep_copy(this->p_vt_out, p_vt_out_h); + + Kokkos::fence(); + + // Call the function + rbf_vec_interpol_edge_lib<TypeParam>( + this->p_vn_in.data(), this->rbf_vec_idx_e.data(), + this->rbf_vec_blk_e.data(), this->rbf_vec_coeff_e.data(), + this->p_vt_out.data(), this->i_startblk, this->i_endblk, + this->i_startidx_in, this->i_endidx_in, this->slev, this->elev, + nlev, nproma, rbf_vec_dim_e, nblks_e, this->lacc, this->acc_async); + + Kokkos::fence(); + + // Copy results back to host + Kokkos::deep_copy(p_vt_out_h, this->p_vt_out); + + // Calculate expected values + using host_space = Kokkos::HostSpace; + Kokkos::View<TypeParam***, host_space> expected_vt("expected_vt", nproma, nlev, nblks_e); + + // Compute expected values + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + expected_vt(je, jk, jb) = static_cast<TypeParam>(0.0); + + for (int j = 0; j < rbf_vec_dim_e; ++j) { + int edge_idx = rbf_vec_idx_e_h[idx_at(j, je, jb)]; + int edge_blk = rbf_vec_blk_e_h[blk_at(j, je, jb)]; + TypeParam 
coeff = rbf_vec_coeff_e_h[coeff_at(j, je, jb)]; + + expected_vt(je, jk, jb) += + coeff * p_vn_in_h[vn_at(edge_idx, jk, edge_blk)]; + } + } + } + } + + TypeParam tol = std::is_same<TypeParam, float>::value ? + static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13); + + // Verify results + for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) { + int i_startidx, i_endidx; + get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb, + this->i_startblk, this->i_endblk, i_startidx, i_endidx); + + for (int jk = this->slev; jk <= this->elev; ++jk) { + for (int je = i_startidx; je <= i_endidx; ++je) { + EXPECT_NEAR(p_vt_out_h[vt_at(je, jk, jb)], + expected_vt(je, jk, jb), tol) + << "Tangential velocity failure at block " << jb << ", level " << jk << ", index " << je; } } } } -// Define a typed test fixture for the functions which have different input and -// output types template <typename TypePair> -class RbfVecInterpolMixedTestFixture : public ::testing::Test, +class RbfVecInterpolDoubleParamTest : public ::testing::Test, public interp_dimensions { public: using InType = typename TypePair::in_type; using OutType = typename TypePair::out_type; - // Constant dimensions. 
- static constexpr int nproma = 3; // inner loop length - static constexpr int nlev = 4; // number of vertical levels - static constexpr int nblks_e = 2; // number of edge blocks (for p_e_in) - static constexpr int nblks_v = - 2; // number of vertex blocks (for rbf arrays and outputs) - static constexpr int rbf_vec_dim = - 6; // fixed dimension for rbf vector (stencil points) + // Using Kokkos execution and memory spaces + using exec_space = Kokkos::DefaultExecutionSpace; + using memory_space = exec_space::memory_space; + + // Kokkos Views for test data + Kokkos::View<InType*, memory_space> p_e_in; // Dimensions: (nproma, nlev, nblks_e) + Kokkos::View<int*, memory_space> rbf_vec_idx_v; // Dimensions: (rbf_vec_dim_v, nproma, nblks_v) + Kokkos::View<int*, memory_space> rbf_vec_blk_v; // Dimensions: (rbf_vec_dim_v, nproma, nblks_v) + Kokkos::View<InType*, memory_space> rbf_vec_coeff_v; // Dimensions: (rbf_vec_dim_v, 2, nproma, nblks_v) + Kokkos::View<OutType*, memory_space> p_u_out; // Dimensions: (nproma, nlev, nblks_v) + Kokkos::View<OutType*, memory_space> p_v_out; // Dimensions: (nproma, nlev, nblks_v) - // Parameter values. - int i_startblk = 0; - int i_endblk = 1; // Test blocks [0, 1] - int i_startidx_in = 0; - int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1 - int slev = 0; - int elev = nlev - 1; // Full vertical range (0 .. nlev-1) - bool lacc = false; // Not using ACC-specific behavior. - bool acc_async = false; // No asynchronous execution. - - // Arrays stored in std::vector. 
- std::vector<InType> p_e_in; // Dimensions: (nproma, nlev, nblks_e) - std::vector<int> rbf_vec_idx_v; // Dimensions: (rbf_vec_dim, nproma, nblks_v) - std::vector<int> rbf_vec_blk_v; // Dimensions: (rbf_vec_dim, nproma, nblks_v) - std::vector<InType> - rbf_vec_coeff_v; // Dimensions: (rbf_vec_dim, 2, nproma, nblks_v) - std::vector<OutType> p_u_out; // Dimensions: (nproma, nlev, nblks_v) - std::vector<OutType> p_v_out; // Dimensions: (nproma, nlev, nblks_v) - - RbfVecInterpolMixedTestFixture() { - // Allocate and initialize inputs. - p_e_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_e), - static_cast<InType>(1)); - rbf_vec_idx_v.resize(num_elements_3d<int>(rbf_vec_dim, nproma, nblks_v), 1); - rbf_vec_blk_v.resize(num_elements_3d<int>(rbf_vec_dim, nproma, nblks_v), 0); - rbf_vec_coeff_v.resize( - num_elements_4d<InType>(rbf_vec_dim, 2, nproma, nblks_v), - static_cast<InType>(1)); - - // Allocate output arrays and initialize to zero. - p_u_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v), - static_cast<OutType>(0)); - p_v_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v), - static_cast<OutType>(0)); - } + RbfVecInterpolDoubleParamTest() + : p_e_in("p_e_in", nproma * nlev * nblks_e), + rbf_vec_idx_v("rbf_vec_idx_v", rbf_vec_dim_v * nproma * nblks_v), + rbf_vec_blk_v("rbf_vec_blk_v", rbf_vec_dim_v * nproma * nblks_v), + rbf_vec_coeff_v("rbf_vec_coeff_v", rbf_vec_dim_v * 2 * nproma * nblks_v), + p_u_out("p_u_out", nproma * nlev * nblks_v), + p_v_out("p_v_out", nproma * nlev * nblks_v) + {} }; -TYPED_TEST_SUITE(RbfVecInterpolMixedTestFixture, MixedTypes); +TYPED_TEST_SUITE(RbfVecInterpolDoubleParamTest, MixedTypes); + +//////////////////////////////////////////////////////////////////////////////// +// +// ! 
rbf_vec_interpol_vertex +// +//////////////////////////////////////////////////////////////////////////////// -TYPED_TEST(RbfVecInterpolMixedTestFixture, Vertex) { +TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexSpecific) { using InType = typename TestFixture::InType; using OutType = typename TestFixture::OutType; + + constexpr int nproma = this->nproma; + constexpr int nlev = this->nlev; + constexpr int nblks_e = this->nblks_e; + constexpr int nblks_v = this->nblks_v; + constexpr int rbf_vec_dim_v = this->rbf_vec_dim_v; + + // Define indexing helpers + const auto &edge_at = at<nproma, nlev, nblks_e>; + const auto &idx_at = at<rbf_vec_dim_v, nproma, nblks_v>; + const auto &blk_at = at<rbf_vec_dim_v, nproma, nblks_v>; + const auto &coeff_at = at<rbf_vec_dim_v, 2, nproma, nblks_v>; + const auto &vert_at = at<nproma, nlev, nblks_v>; + + // Create host mirror views + auto p_e_in_h = Kokkos::create_mirror_view(this->p_e_in); + auto rbf_vec_idx_v_h = Kokkos::create_mirror_view(this->rbf_vec_idx_v); + auto rbf_vec_blk_v_h = Kokkos::create_mirror_view(this->rbf_vec_blk_v); + auto rbf_vec_coeff_v_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_v); + auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out); + auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out); + + // Initialize with index-based pattern for edge data + for (int ib = 0; ib < nblks_e; ++ib) { + for (int ik = 0; ik < nlev; ++ik) { + for (int ic = 0; ic < nproma; ++ic) { + // Value depends on all three indices + p_e_in_h[edge_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01); + } + } + } + + // Initialize vertex connectivity indices with specific pattern + for (int ib = 0; ib < nblks_v; ++ib) { + for (int ic = 0; ic < nproma; ++ic) { + // Each vertex connects to 6 edges + for (int j = 0; j < rbf_vec_dim_v; ++j) { + // Edge indices with a pattern + rbf_vec_idx_v_h[idx_at(j, ic, ib)] = (ic + j) % nproma; + rbf_vec_blk_v_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e; + + // 
Interpolation coefficients that depend on indices + rbf_vec_coeff_v_h[coeff_at(j, 0, ic, ib)] = static_cast<InType>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient + rbf_vec_coeff_v_h[coeff_at(j, 1, ic, ib)] = static_cast<InType>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient + } + + // Initialize output to zero + for (int ik = 0; ik < nlev; ++ik) { + p_u_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); + p_v_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0); + } + } + } + + // Copy to device + Kokkos::deep_copy(this->p_e_in, p_e_in_h); + Kokkos::deep_copy(this->rbf_vec_idx_v, rbf_vec_idx_v_h); + Kokkos::deep_copy(this->rbf_vec_blk_v, rbf_vec_blk_v_h); + Kokkos::deep_copy(this->rbf_vec_coeff_v, rbf_vec_coeff_v_h); + Kokkos::deep_copy(this->p_u_out, p_u_out_h); + Kokkos::deep_copy(this->p_v_out, p_v_out_h); + + Kokkos::fence(); - // Call the function with mixed precision. + // Call the function rbf_vec_interpol_vertex_lib<InType, OutType>( this->p_e_in.data(), this->rbf_vec_idx_v.data(), this->rbf_vec_blk_v.data(), this->rbf_vec_coeff_v.data(), this->p_u_out.data(), this->p_v_out.data(), this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev, - this->elev, this->nproma, this->lacc, this->acc_async, this->nlev, - this->nblks_e, this->nblks_v); - - // Check the outputs only for blocks in the range [i_startblk, i_endblk]. - for (int block = this->i_startblk; block <= this->i_endblk; ++block) { - for (int level = 0; level < this->nlev; ++level) { - for (int i = 0; i < this->nproma; ++i) { - // Compute the linear index for a 3D array in column-major order: - size_t idx = - i + level * this->nproma + block * this->nproma * this->nlev; - // Since every contribution is 1 and there are 6 stencil points, - // expect 6. 
-        EXPECT_NEAR(this->p_u_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
-        EXPECT_NEAR(this->p_v_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+      this->elev, nproma, this->lacc, this->acc_async, nlev,
+      nblks_e, nblks_v);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Expected results based on the specific test values
+  std::vector<OutType> expected_u = {
+    12.3709, 13.5139, 14.7169, 12.9859, 14.1889, 15.4519,
+    13.6009, 14.8639, 16.1869, 14.2159, 15.5389, 16.9219,
+    24.4006, 25.5436, 26.7466, 25.6156, 26.8186, 28.0816,
+    26.8306, 28.0936, 29.4166, 28.0456, 29.3686, 30.7516
+  };
+  std::vector<OutType> expected_v = {
+    24.4009, 25.5439, 26.7469, 25.6159, 26.8189, 28.0819,
+    26.8309, 28.0939, 29.4169, 28.0459, 29.3689, 30.7519,
+    36.4306, 37.5736, 38.7766, 38.2456, 39.4486, 40.7116,
+    40.0606, 41.3236, 42.6466, 41.8756, 43.1986, 44.5816
+  };
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_u_out_h[vert_at(jv, jk, jb)],
+                    expected_u[vert_at(jv, jk, jb)],
+                    static_cast<OutType>(1e-5))
+          << "u failure at block " << jb << ", level " << jk << ", index " << jv;
+        EXPECT_NEAR(p_v_out_h[vert_at(jv, jk, jb)],
+                    expected_v[vert_at(jv, jk, jb)],
+                    static_cast<OutType>(1e-5))
+          << "v failure at block " << jb << ", level " << jk << ", index " << jv;
+      }
+    }
+  }
+}
+
+// Randomized check of rbf_vec_interpol_vertex_lib: the edge input, the stencil
+// indices/blocks and the two coefficient components are drawn from a seeded
+// generator, the library routine is run on the device views, and every output
+// value in the processed range is compared with a host-side sum over the
+// rbf_vec_dim_v stencil entries.
+TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexRandom) {
+  using InType = typename TestFixture::InType;
+  using OutType = typename TestFixture::OutType;
+
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int rbf_vec_dim_v = 6; // Fixed dimension for RBF
+
+  // Define indexing helpers
+  // (each helper maps a multi-index to the flat offset used with operator[]
+  // on the views below; idx_at and blk_at are instantiated identically)
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &blk_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &coeff_at = at<rbf_vec_dim_v, 2, nproma, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_e_in_h = Kokkos::create_mirror_view(this->p_e_in);
+  auto rbf_vec_idx_v_h = Kokkos::create_mirror_view(this->rbf_vec_idx_v);
+  auto rbf_vec_blk_v_h = Kokkos::create_mirror_view(this->rbf_vec_blk_v);
+  auto rbf_vec_coeff_v_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_v);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Use fixed seed for reproducibility
+  // The integer distributions are inclusive on both ends, so every drawn
+  // (index, block) pair lies inside the nproma x nblks_e extent used by edge_at.
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_e_in_h[edge_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+
+      for (int j = 0; j < rbf_vec_dim_v; ++j) {
+        // Random edge indices and blocks
+        rbf_vec_idx_v_h[idx_at(j, ic, ib)] = edge_distrib(gen);
+        rbf_vec_blk_v_h[blk_at(j, ic, ib)] = block_distrib(gen);
+        // Random coefficients for interpolation
+        rbf_vec_coeff_v_h[coeff_at(j, 0, ic, ib)] = static_cast<InType>(real_distrib(gen));
+        rbf_vec_coeff_v_h[coeff_at(j, 1, ic, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+        p_v_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_e_in, p_e_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_v, rbf_vec_idx_v_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_v, rbf_vec_blk_v_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_v, rbf_vec_coeff_v_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Call the function
+  rbf_vec_interpol_vertex_lib<InType, OutType>(
+      this->p_e_in.data(), this->rbf_vec_idx_v.data(),
+      this->rbf_vec_blk_v.data(), this->rbf_vec_coeff_v.data(),
+      this->p_u_out.data(), this->p_v_out.data(), this->i_startblk,
+      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
+      this->elev, nproma, this->lacc, this->acc_async, nlev,
+      nblks_e, nblks_v);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Calculate expected values
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<OutType***, host_space> expected_u("expected_u", nproma, nlev, nblks_v);
+  Kokkos::View<OutType***, host_space> expected_v("expected_v", nproma, nlev, nblks_v);
+
+  // Compute expected values
+  // Block, level and index bounds are all inclusive here, exactly as in the
+  // verification loop further down.
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        expected_u(jv, jk, jb) = static_cast<OutType>(0.0);
+        expected_v(jv, jk, jb) = static_cast<OutType>(0.0);
+
+        // Accumulate coefficient * edge value over the stencil: coefficient
+        // component 0 feeds u, component 1 feeds v; each product is formed in
+        // InType and converted to OutType before being added.
+        for (int j = 0; j < rbf_vec_dim_v; ++j) {
+          int edge_idx = rbf_vec_idx_v_h[idx_at(j, jv, jb)];
+          int edge_blk = rbf_vec_blk_v_h[blk_at(j, jv, jb)];
+          InType coeff_u = rbf_vec_coeff_v_h[coeff_at(j, 0, jv, jb)];
+          InType coeff_v = rbf_vec_coeff_v_h[coeff_at(j, 1, jv, jb)];
+
+          expected_u(jv, jk, jb) +=
+              static_cast<OutType>(coeff_u * p_e_in_h[edge_at(edge_idx, jk, edge_blk)]);
+          expected_v(jv, jk, jb) +=
+              static_cast<OutType>(coeff_v * p_e_in_h[edge_at(edge_idx, jk, edge_blk)]);
+        }
+      }
+    }
+  }
+
+  // Absolute tolerance: 1e-5 when OutType is float, 1e-13 otherwise.
+  OutType tol = std::is_same<OutType, float>::value ?
+      static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13);
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_u_out_h[vert_at(jv, jk, jb)],
+                    expected_u(jv, jk, jb), tol)
+          << "u failure at block " << jb << ", level " << jk << ", index " << jv;
+        EXPECT_NEAR(p_v_out_h[vert_at(jv, jk, jb)],
+                    expected_v(jv, jk, jb), tol)
+          << "v failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
   }
diff --git a/test/c/test_tdma_solver.cpp b/test/c/test_tdma_solver.cpp
index 4e09ff335368b2cbc532fd2ade2aee8a8259415d..bb1dddff1d19f895fc0498f5abec69f260b9758a 100644
--- a/test/c/test_tdma_solver.cpp
+++ b/test/c/test_tdma_solver.cpp
@@ -13,76 +13,341 @@
 #include <vector>
 #include <algorithm>
 #include "mo_math_utilities.hpp"
+#include "dim_helper.hpp"
+#include <Kokkos_Core.hpp>
+#include <random>
 
-// Helper function to compute the 1D index for column-major storage.
+// Helper function for column-major indexing
+template <typename T>
 inline int idx(int i, int j, int nrows) {
   return i + j * nrows;
 }
 
-// Test fixture for the TDMA solver tests.
-class TDMASolverTestFixture : public ::testing::Test {
+// Typed fixture for the TDMA solver tests. It owns five n*n Kokkos views in
+// the default execution space's memory space (a, b, c, d are passed to the
+// solver as inputs, x receives its output). Tests address the views in
+// column-major order through idx<T>().
+template <typename T>
+class TDMASolverTypedTestFixture : public ::testing::Test {
 protected:
-  const int n = 10; // Matrix dimension.
-  std::vector<double> a; // Input matrix a.
-  std::vector<double> b; // Input matrix b.
-  std::vector<double> c; // Input matrix c.
-  std::vector<double> d; // Input matrix d.
-  std::vector<double> x; // Output matrix.
-
-  TDMASolverTestFixture()
-      : a(n * n), b(n * n), c(n * n), d(n * n), x(n * n, 0.0) {}
-
-  // SetUp is run before each test.
-  void SetUp() override {
-    // Fill arrays in column-major order.
+  const int n = 10; // Matrix dimension.
+
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
+  // Kokkos Views for test data
+  Kokkos::View<T*, memory_space> a; // Input matrix a.
+  Kokkos::View<T*, memory_space> b; // Input matrix b.
+  Kokkos::View<T*, memory_space> c; // Input matrix c.
+  Kokkos::View<T*, memory_space> d; // Input matrix d.
+  Kokkos::View<T*, memory_space> x; // Output matrix.
+
+  // n is declared before the views, so it is already initialized when the
+  // view extents are computed here.
+  TDMASolverTypedTestFixture()
+      : a("a", n * n),
+        b("b", n * n),
+        c("c", n * n),
+        d("d", n * n),
+        x("x", n * n)
+  {}
+
+  // Fills a, b, c, d with the deterministic pattern value = (i+1)+(j+1)
+  // (b scaled by 2), zeroes x, and copies everything to the device views.
+  // Called explicitly by the "Specific" tests; the "Random" tests fill the
+  // views themselves.
+  void SetUpSpecificTest() {
+    // Create host mirror views
+    auto a_h = Kokkos::create_mirror_view(a);
+    auto b_h = Kokkos::create_mirror_view(b);
+    auto c_h = Kokkos::create_mirror_view(c);
+    auto d_h = Kokkos::create_mirror_view(d);
+    auto x_h = Kokkos::create_mirror_view(x);
+
+    // Fill arrays in column-major order with the specific test values
     for (int j = 0; j < n; j++) {
       for (int i = 0; i < n; i++) {
-        double value = (i + 1) + (j + 1);
-        a[idx(i, j, n)] = 1.0 * value;
-        b[idx(i, j, n)] = 2.0 * value;
-        c[idx(i, j, n)] = 1.0 * value;
-        d[idx(i, j, n)] = 1.0 * value;
+        T value = static_cast<T>((i + 1) + (j + 1));
+        a_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        b_h[idx<T>(i, j, n)] = static_cast<T>(2.0) * value;
+        c_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        d_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        x_h[idx<T>(i, j, n)] = static_cast<T>(0.0);
       }
     }
-    // Clear the output vector.
-    std::fill(x.begin(), x.end(), 0.0);
+
+    // Copy to device
+    Kokkos::deep_copy(a, a_h);
+    Kokkos::deep_copy(b, b_h);
+    Kokkos::deep_copy(c, c_h);
+    Kokkos::deep_copy(d, d_h);
+    Kokkos::deep_copy(x, x_h);
   }
 };
 
-TEST_F(TDMASolverTestFixture, FullTest) {
-  // Call the solver over the full range:
-  tdma_solver_vec<double>(a.data(), b.data(), c.data(), d.data(),
-                          0, n, 0, n, n, n, x.data());
+// Define the types we want to test with
+using NumericTypes = ::testing::Types<float, double>;
+TYPED_TEST_SUITE(TDMASolverTypedTestFixture, NumericTypes);
+
+// Absolute comparison tolerance shared by all TDMA tests: 1e-5 when T is
+// float, 1e-13 for every other type.
+template <typename T>
+constexpr T tdma_tolerance() {
+  return std::is_same<T, float>::value ? static_cast<T>(1e-5)
+                                       : static_cast<T>(1e-13);
+}
+
+// Specific test for the full matrix
+TYPED_TEST(TDMASolverTypedTestFixture, SpecificFull) {
+  const int n = this->n;
+
+  // Set up the test with specific values
+  this->SetUpSpecificTest();
+
+  // Call the solver over the full range
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      0, n, 0, n, n, n, this->x.data());
+
+  // Copy results back to host
+  auto x_h = Kokkos::create_mirror_view(this->x);
+  Kokkos::deep_copy(x_h, this->x);
 
-  // Compute the sum of all elements in the output matrix.
-  double sum = 0.0;
+  // Compute the sum of all elements in the output matrix
+  TypeParam sum = 0.0;
   for (int j = 0; j < n; j++) {
     for (int i = 0; i < n; i++) {
-      sum += x[idx(i, j, n)];
+      sum += x_h[idx<TypeParam>(i, j, n)];
     }
   }
 
-  // Expected reference sum
-  double sum_ref = 27.2727272727272769;
-  double tol = 1e-13;
+  // Expected reference sum (adjusted for precision)
+  TypeParam sum_ref = static_cast<TypeParam>(27.2727272727272769);
+  const TypeParam tol = tdma_tolerance<TypeParam>();
+
   EXPECT_NEAR(sum, sum_ref, tol);
 }
 
-TEST_F(TDMASolverTestFixture, PartialTest) {
-  // Call the solver for a partial region:
-  // For C++: slev = 1, elev = n-1, startidx = 1, endidx = n-1.
-  tdma_solver_vec<double>(a.data(), b.data(), c.data(), d.data(),
-                          1, n - 1, 1, n - 1, n, n, x.data());
+// Specific test for a partial region
+TYPED_TEST(TDMASolverTypedTestFixture, SpecificPartial) {
+  const int n = this->n;
+
+  // Set up the test with specific values
+  this->SetUpSpecificTest();
+
+  // Call the solver for a partial region
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      1, n - 1, 1, n - 1, n, n, this->x.data());
+
+  // Copy results back to host
+  auto x_h = Kokkos::create_mirror_view(this->x);
+  Kokkos::deep_copy(x_h, this->x);
 
   // Compute the sum over a region
-  double sum = 0.0;
+  TypeParam sum = 0.0;
   for (int j = 1; j < n - 1; j++) {
     for (int i = 1; i < n - 1; i++) {
-      sum += x[idx(i, j, n)];
+      sum += x_h[idx<TypeParam>(i, j, n)];
     }
   }
 
-  double sum_ref = 17.7777777777777679;
-  double tol = 1e-13;
+  // Expected reference sum (adjusted for precision)
+  TypeParam sum_ref = static_cast<TypeParam>(17.7777777777777679);
+  const TypeParam tol = tdma_tolerance<TypeParam>();
+
   EXPECT_NEAR(sum, sum_ref, tol);
 }
+
+// Host-side reference used by the random tests. For every index i in
+// [startidx, endidx) it runs a forward sweep and a back substitution over the
+// levels j in [slev, elev) (both upper bounds exclusive) and writes the result
+// into x. All arrays are n*n and addressed with idx<T>(i, j, n). Entries of x
+// outside the requested region are left unchanged; an empty level range is a
+// no-op.
+template <typename T>
+void thomas_reference(const std::vector<T> &a, const std::vector<T> &b,
+                      const std::vector<T> &c, const std::vector<T> &d,
+                      int slev, int elev, int startidx, int endidx, int n,
+                      std::vector<T> &x) {
+  if (elev <= slev) {
+    return;
+  }
+
+  // Scratch arrays; every entry in [slev, elev) is rewritten for each i
+  // before it is read, so they can be reused across iterations.
+  std::vector<T> c_prime(n, 0.0);
+  std::vector<T> d_prime(n, 0.0);
+
+  for (int i = startidx; i < endidx; i++) {
+    // Forward sweep
+    c_prime[slev] = c[idx<T>(i, slev, n)] / b[idx<T>(i, slev, n)];
+    d_prime[slev] = d[idx<T>(i, slev, n)] / b[idx<T>(i, slev, n)];
+
+    for (int j = slev + 1; j < elev; j++) {
+      T m = static_cast<T>(1.0) /
+            (b[idx<T>(i, j, n)] - c_prime[j-1] * a[idx<T>(i, j, n)]);
+      c_prime[j] = c[idx<T>(i, j, n)] * m;
+      d_prime[j] = (d[idx<T>(i, j, n)] - d_prime[j-1] * a[idx<T>(i, j, n)]) * m;
+    }
+
+    // Back substitution
+    x[idx<T>(i, elev-1, n)] = d_prime[elev-1];
+
+    for (int j = elev-2; j >= slev; j--) {
+      x[idx<T>(i, j, n)] = d_prime[j] - c_prime[j] * x[idx<T>(i, j+1, n)];
+    }
+  }
+}
+
+// Random test for the full matrix
+TYPED_TEST(TDMASolverTypedTestFixture, RandomFull) {
+  const int n = this->n;
+
+  // Create host mirror views
+  auto a_h = Kokkos::create_mirror_view(this->a);
+  auto b_h = Kokkos::create_mirror_view(this->b);
+  auto c_h = Kokkos::create_mirror_view(this->c);
+  auto d_h = Kokkos::create_mirror_view(this->d);
+  auto x_h = Kokkos::create_mirror_view(this->x);
+
+  // Use fixed seed for reproducibility
+  // |a| + |c| <= 4 < 5 <= b, so every generated row is diagonally dominant.
+  std::mt19937 gen(42);
+  std::uniform_real_distribution<double> diag_dist(5.0, 10.0);     // For main diagonal
+  std::uniform_real_distribution<double> off_diag_dist(0.1, 2.0);  // For off-diagonals
+  std::uniform_real_distribution<double> rhs_dist(-10.0, 10.0);    // For right-hand side
+
+  // Fill arrays with random values
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(diag_dist(gen));
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(rhs_dist(gen));
+      x_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(0.0);
+    }
+  }
+
+  // Snapshot the host inputs for the reference solution before the solver
+  // runs, so the reference never reads data the solver call could have
+  // touched through the views.
+  const std::vector<TypeParam> a_copy(a_h.data(), a_h.data() + n * n);
+  const std::vector<TypeParam> b_copy(b_h.data(), b_h.data() + n * n);
+  const std::vector<TypeParam> c_copy(c_h.data(), c_h.data() + n * n);
+  const std::vector<TypeParam> d_copy(d_h.data(), d_h.data() + n * n);
+  std::vector<TypeParam> x_expected(n * n, 0.0);
+
+  // Copy to device
+  Kokkos::deep_copy(this->a, a_h);
+  Kokkos::deep_copy(this->b, b_h);
+  Kokkos::deep_copy(this->c, c_h);
+  Kokkos::deep_copy(this->d, d_h);
+  Kokkos::deep_copy(this->x, x_h);
+
+  // Call the solver over the full range
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      0, n, 0, n, n, n, this->x.data());
+
+  // Copy results back to host
+  Kokkos::deep_copy(x_h, this->x);
+
+  // Calculate reference solution
+  thomas_reference<TypeParam>(a_copy, b_copy, c_copy, d_copy,
+                              0, n, 0, n, n, x_expected);
+
+  // Set tolerance based on type
+  const TypeParam tol = tdma_tolerance<TypeParam>();
+
+  // Verify that individual values match
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      EXPECT_NEAR(x_h[idx<TypeParam>(i, j, n)], x_expected[idx<TypeParam>(i, j, n)], tol)
+          << "Mismatch at i=" << i << ", j=" << j;
+    }
+  }
+}
+
+// Random test for a partial region
+TYPED_TEST(TDMASolverTypedTestFixture, RandomPartial) {
+  const int n = this->n;
+  const int slev = 1;
+  const int elev = n - 1;
+  const int startidx = 1;
+  const int endidx = n - 1;
+
+  // Create host mirror views
+  auto a_h = Kokkos::create_mirror_view(this->a);
+  auto b_h = Kokkos::create_mirror_view(this->b);
+  auto c_h = Kokkos::create_mirror_view(this->c);
+  auto d_h = Kokkos::create_mirror_view(this->d);
+  auto x_h = Kokkos::create_mirror_view(this->x);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(43);
+  std::uniform_real_distribution<double> diag_dist(5.0, 10.0);
+  std::uniform_real_distribution<double> off_diag_dist(0.1, 2.0);
+  std::uniform_real_distribution<double> rhs_dist(-10.0, 10.0);
+
+  // Initialize all values to something that shouldn't be touched
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      x_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(0.0);
+    }
+  }
+
+  // Set random values only for the region to be processed
+  for (int j = slev; j < elev; j++) {
+    for (int i = startidx; i < endidx; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(diag_dist(gen));
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(rhs_dist(gen));
+    }
+  }
+
+  // Snapshot the host inputs for the reference solution before the solver
+  // runs. Entries outside the processed region keep the -999 sentinel in the
+  // snapshot as well.
+  const std::vector<TypeParam> a_copy(a_h.data(), a_h.data() + n * n);
+  const std::vector<TypeParam> b_copy(b_h.data(), b_h.data() + n * n);
+  const std::vector<TypeParam> c_copy(c_h.data(), c_h.data() + n * n);
+  const std::vector<TypeParam> d_copy(d_h.data(), d_h.data() + n * n);
+  std::vector<TypeParam> x_expected(n * n, static_cast<TypeParam>(0.0));
+
+  // Copy to device
+  Kokkos::deep_copy(this->a, a_h);
+  Kokkos::deep_copy(this->b, b_h);
+  Kokkos::deep_copy(this->c, c_h);
+  Kokkos::deep_copy(this->d, d_h);
+  Kokkos::deep_copy(this->x, x_h);
+
+  // Call the solver for the partial region
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      slev, elev, startidx, endidx, n, n, this->x.data());
+
+  // Copy results back to host
+  Kokkos::deep_copy(x_h, this->x);
+
+  // Calculate reference solution for the partial region
+  thomas_reference<TypeParam>(a_copy, b_copy, c_copy, d_copy,
+                              slev, elev, startidx, endidx, n, x_expected);
+
+  // Set tolerance based on type
+  const TypeParam tol = tdma_tolerance<TypeParam>();
+
+  // Verify that individual values match
+  for (int j = slev; j < elev; j++) {
+    for (int i = startidx; i < endidx; i++) {
+      EXPECT_NEAR(x_h[idx<TypeParam>(i, j, n)], x_expected[idx<TypeParam>(i, j, n)], tol)
+          << "Mismatch at i=" << i << ", j=" << j;
+    }
+  }
+}