From 9a77e6f1a4d798c2fce7b3746431fb0478d125d2 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 24 Mar 2025 21:53:24 +0100
Subject: [PATCH 01/34] made the cpp codes run on GPUs and modified the tests
 accordingly

removed some comments
---
 .gitlab-ci.yml                                |   5 +-
 .../mo_lib_interpolation_scalar.cpp           |   5 +-
 test/c/CMakeLists.txt                         |  12 +-
 test/c/test_horizontal_div.cpp                | 481 +++++++++++-------
 4 files changed, 304 insertions(+), 199 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9837066..c12492b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,9 +65,10 @@ nvhpc_cpu:
   variables:
     SCHEDULER_PARAMETERS: "$SLURM_OPTIONS_CPU $SLURM_NTASKS"
   script:
-    - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/22.5-gcc-11.2.0
+    - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/24.7-gcc-11.2.0
     - mkdir nvhpc_cpu
     - cd nvhpc_cpu
+    - export LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib64:$LD_LIBRARY_PATH
     - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran
     - make VERBOSE=1
     - make test
@@ -111,7 +112,7 @@ OpenMP_gcc:
     - module load git gcc/11.2.0-gcc-11.2.0
     - mkdir openmp_gcc
     - cd openmp_gcc
-    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran -DIM_ENABLE_OPENMP=ON
+    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran -DIM_ENABLE_GPU=nvidia-sm80 -DIM_ENABLE_OPENMP=ON
     - make VERBOSE=1
     - make test
   tags:
diff --git a/src/interpolation/mo_lib_interpolation_scalar.cpp b/src/interpolation/mo_lib_interpolation_scalar.cpp
index 9e4e6c5..8910cb2 100644
--- a/src/interpolation/mo_lib_interpolation_scalar.cpp
+++ b/src/interpolation/mo_lib_interpolation_scalar.cpp
@@ -136,10 +136,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx,
               p_edge_out_view(je, jk, jb) = p_cell_in_view(
                   iidx_view(je, jb, 1), jk, iblk_view(je, jb, 1));
             } else {
-              std::cerr << "mo_interpolation:cells2edges_scalar_lib: error in "
-                           "lateral boundary filling"
-                        << std::endl;
-              std::exit(EXIT_FAILURE);
+              Kokkos::abort("mo_interpolation:cells2edges_scalar_lib: error in lateral boundary filling");
             }
           });
       Kokkos::fence();
diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index 90ab1e3..0b42439 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -33,12 +33,12 @@ endif()
 set(SOURCES
   main.cpp
   test_horizontal_div.cpp
-  test_horizontal_recon.cpp
-  test_horizontal_rot.cpp
-  test_tdma_solver.cpp
-  test_interpolation_vector.cpp
-  test_intp_rbf.cpp
-  test_interpolation_scalar.cpp
+  # test_horizontal_recon.cpp
+  # test_horizontal_rot.cpp
+  # test_tdma_solver.cpp
+  # test_interpolation_vector.cpp
+  # test_intp_rbf.cpp
+  # test_interpolation_scalar.cpp
 )
 # Create the test executable from your test files, including main.cpp.
 add_executable(iconmath_test_c ${SOURCES})
diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index 596d19e..2d06bc0 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -20,7 +20,8 @@
 #include <support/mo_lib_loopindices.hpp>
 
 /// Test class for the horizontal divergence tests. Templated for the ValueType
-template <typename ValueType> class HorizontalDivTest : public ::testing::Test {
+template <typename ValueType>
+class HorizontalDivTest : public ::testing::Test {
 protected:
   static constexpr int nproma = 3;  // inner loop length
   static constexpr int nlev = 2;    // number of vertical levels
@@ -36,43 +37,51 @@ protected:
   std::vector<int> elev;
   bool lacc = false; // Not using ACC-specific behavior.
 
-  std::vector<ValueType> vec_e;
-  std::vector<int> cell_edge_idx;
-  std::vector<int> cell_edge_blk;
-  std::vector<ValueType> geofac_div;
-  std::vector<ValueType> div_vec_c;
-  std::vector<ValueType> f4din;
-  std::vector<ValueType> f4dout;
+  // Here we allocate Kokkos::View objects in the default execution space's memory space.
+  // That space may not be host-accessible, so tests fill host mirrors and deep_copy them over.
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
 
-  // Followings are needed in HorizontalDivAvgTest
-  std::vector<int> cell_neighbor_idx;
-  std::vector<int> cell_neighbor_blk;
-  std::vector<ValueType> avg_coeff;
-  std::vector<ValueType> opt_in2;
-  std::vector<ValueType> opt_out2;
+  // Views for the test data. All the data is assigned as one-dimensional arrays
+  Kokkos::View<ValueType*, memory_space> vec_e;
+  Kokkos::View<int*, memory_space> cell_edge_idx;
+  Kokkos::View<int*, memory_space> cell_edge_blk;
+  Kokkos::View<ValueType*, memory_space> geofac_div;
+  Kokkos::View<ValueType*, memory_space> div_vec_c;
+  Kokkos::View<ValueType*, memory_space> f4din;
+  Kokkos::View<ValueType*, memory_space> f4dout;
 
-  HorizontalDivTest() {
+  // The following are needed in HorizontalDivAvgTest
+  Kokkos::View<int*, memory_space> cell_neighbor_idx;
+  Kokkos::View<int*, memory_space> cell_neighbor_blk;
+  Kokkos::View<ValueType*, memory_space> avg_coeff;
+  Kokkos::View<ValueType*, memory_space> opt_in2;
+  Kokkos::View<ValueType*, memory_space> opt_out2;
+
+  HorizontalDivTest()
+      : vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
+        cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, 3)),
+        cell_edge_blk("cell_edge_blk", dim_combine(nproma, nblks_c, 3)),
+        geofac_div("geofac_div", dim_combine(nproma, 3, nblks_c)),
+        div_vec_c("div_vec_c", dim_combine(nproma, nlev, nblks_c)),
+        f4din("f4din", dim_combine(nproma, nlev, nblks_e, dim4d)),
+        f4dout("f4dout", dim_combine(nproma, nlev, nblks_c, dim4d)),
+        cell_neighbor_idx("cell_neighbor_idx", dim_combine(nproma, nblks_c, 3)),
+        cell_neighbor_blk("cell_neighbor_blk", dim_combine(nproma, nblks_c, 3)),
+        avg_coeff("avg_coeff", dim_combine(nproma, 4, nblks_c)),
+        opt_in2("opt_in2", dim_combine(nproma, nlev, nblks_e)),
+        opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c))
+  {
+    // avg_coeff keeps an extent of 4 in its second dimension (as in the former std::vector), not nlev.
+    // We keep slev and elev as std::vector since they are small and used only on the host.
     slev.resize(dim4d, 0);
     elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1)
 
-    vec_e.resize(dim_combine(nproma, nlev, nblks_e));
-    cell_edge_idx.resize(dim_combine(nproma, nblks_c, 3));
-    cell_edge_blk.resize(dim_combine(nproma, nblks_c, 3));
-    geofac_div.resize(dim_combine(nproma, 3, nblks_c));
-    div_vec_c.resize(dim_combine(nproma, nlev, nblks_c));
-    f4din.resize(dim_combine(nproma, nlev, nblks_e, dim4d));
-    f4dout.resize(dim_combine(nproma, nlev, nblks_c, dim4d));
-    cell_neighbor_idx.resize(dim_combine(nproma, nblks_c, 3));
-    cell_neighbor_blk.resize(dim_combine(nproma, nblks_c, 3));
-    avg_coeff.resize(dim_combine(nproma, 4, nblks_c));
-    opt_in2.resize(dim_combine(nproma, nlev, nblks_e));
-    opt_out2.resize(dim_combine(nproma, nlev, nblks_c));
   }
 };
 
 /// ValueTypes which the divrot tests should run with
 typedef ::testing::Types<float, double> ValueTypes;
-
 TYPED_TEST_SUITE(HorizontalDivTest, ValueTypes);
 
 TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) {
@@ -86,34 +95,46 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) {
   const auto &geofac_div_at = at<nproma, 3, nblks_c>;
   const auto &div_vec_c_at = at<nproma, nlev, nblks_c>;
 
-  // Initialization with specific values
+  // create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+
+  // Initialize the host mirrors with specific values.
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
+      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1));
     }
-
     // Set edge indices to point to specific cells (including self)
-    this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i;
-    this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i;
+    cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
 
     // All edges are in the same block for this test
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
     }
 
-    // Geometric factors
-    this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5;
-    this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3;
-    this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2;
+    geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5);
+    geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3);
+    geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2);
 
     // Initialize div_vec_c to zero
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0;
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
     }
   }
-
-  // Call the div3d function
+  // Copy the initialized host data to the device views (this may be a no-op when the
+  // mirror view aliases memory that is already accessible on the host).
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+
+  // Call the div3d function using the device pointers from the Views.
   div3d<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
                    this->cell_edge_blk.data(), this->geofac_div.data(),
                    this->div_vec_c.data(), this->i_startblk, this->i_endblk,
@@ -121,12 +142,14 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) {
                    this->elev[0], this->nproma, this->lacc, this->nlev,
                    this->nblks_c, this->nblks_e);
 
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.4, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.1, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.2, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.2, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.4, 1e-6);
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 0, 0)), static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 1, 0)), static_cast<TypeParam>(3.4), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 0, 0)), static_cast<TypeParam>(2.1), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 1, 0)), static_cast<TypeParam>(4.2), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 0, 0)), static_cast<TypeParam>(2.2), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 1, 0)), static_cast<TypeParam>(4.4), 1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
@@ -140,43 +163,52 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   const auto &geofac_div_at = at<nproma, 3, nblks_c>;
   const auto &div_vec_c_at = at<nproma, nlev, nblks_c>;
 
-  // Set up random number generators
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+
+  // Initialize the arrays with random values.
   std::random_device rd;
   std::mt19937 gen(rd());
-  std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
   std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
 
-  // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
+      vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen);
     }
 
-    // Set random edge indices
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
     }
 
-    // Random geometric factors
     for (int j = 0; j < 3; ++j) {
-      this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen);
+      geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen);
     }
 
-    // Initialize div_vec_c to random values
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen);
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = real_distrib(gen);
     }
   }
 
-  // Call the div3d function
-  div3d<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
-                   this->cell_edge_blk.data(), this->geofac_div.data(),
-                   this->div_vec_c.data(), this->i_startblk, this->i_endblk,
-                   this->i_startidx_in, this->i_endidx_in, this->slev[0],
-                   this->elev[0], this->nproma, this->lacc, this->nlev,
-                   this->nblks_c, this->nblks_e);
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+
+  div3d<TypeParam>(
+      this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+      this->geofac_div.data(), this->div_vec_c.data(), this->i_startblk,
+      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev[0],
+      this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c,
+      this->nblks_e);
+
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
 
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0);
@@ -189,18 +221,18 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
   }
@@ -208,7 +240,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)],
+      EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
                   ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
           << "Results differ at i=" << i << ", k=" << k;
     }
@@ -229,36 +261,53 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) {
   const auto &f4d_at = at<nproma, nlev, nblks_e, dim4d>;
   const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>;
 
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto f4din_h = Kokkos::create_mirror_view(this->f4din);
+  auto f4dout_h = Kokkos::create_mirror_view(this->f4dout);
+
   // Initialization with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
-      this->f4din[f4d_at(i, k, 0, 0)] =
-          (i + 1) * (k + 2); // Different pattern for second field
+      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      f4din_h[f4d_at(i, k, 0, 0)] = static_cast<TypeParam>((i + 1) * (k + 2)); // Different pattern for second field
     }
 
     // Set edge indices to point to specific cells (including self)
-    this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i;
-    this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i;
+    cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
 
     // All edges are in the same block for this test
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
     }
 
     // Geometric factors
-    this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5;
-    this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3;
-    this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2;
+    geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5);
+    geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3);
+    geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2);
 
     // Initialize div_vec_c and f4dout to zero
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0;
-      this->f4dout[f4dout_at(i, k, 0, 0)] = 0.0;
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
+      f4dout_h[f4dout_at(i, k, 0, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->f4din, f4din_h);
+  Kokkos::deep_copy(this->f4dout, f4dout_h);
+
   // Call the div3d_2field function
   div3d_2field<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
                           this->cell_edge_blk.data(), this->geofac_div.data(),
@@ -268,21 +317,25 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) {
                           this->elev[0], this->nproma, this->lacc, this->nlev,
                           this->nblks_c, this->nblks_e);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(f4dout_h, this->f4dout);
+
   // Check first field (same as in div3d test)
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.4, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.1, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.2, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.2, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.4, 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.4), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.1), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.2), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.2), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.4), 1e-6);
 
   // Check second field (expected values calculated manually)
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 0)], 3.4, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 0)], 5.1, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 0)], 4.2, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 0)], 6.3, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 0)], 4.4, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 0)], 6.6, 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(3.4), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(5.1), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(4.2), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(6.3), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(4.4), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(6.6), 1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
@@ -299,6 +352,15 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   const auto &f4d_at = at<nproma, nlev, nblks_e, dim4d>;
   const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>;
 
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto f4din_h = Kokkos::create_mirror_view(this->f4din);
+  auto f4dout_h = Kokkos::create_mirror_view(this->f4dout);
+
   // Set up random number generators
   std::random_device rd;
   std::mt19937 gen(rd());
@@ -308,37 +370,48 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
-      this->f4din[f4d_at(i, k, 0, 0)] = real_distrib(gen);
+      vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen);
+      f4din_h[f4d_at(i, k, 0, 0)] = real_distrib(gen);
     }
 
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
     for (int j = 0; j < 3; ++j) {
-      this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen);
+      geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen);
     }
 
     // Initialize div_vec_c and f4dout to random values
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen);
-      this->f4dout[f4dout_at(i, k, 0, 0)] = real_distrib(gen);
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = real_distrib(gen);
+      f4dout_h[f4dout_at(i, k, 0, 0)] = real_distrib(gen);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->f4din, f4din_h);
+  Kokkos::deep_copy(this->f4dout, f4dout_h);
+
   // Call the div3d_2field function
-  div3d_2field<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
-                          this->cell_edge_blk.data(), this->geofac_div.data(),
-                          this->div_vec_c.data(), this->f4din.data(),
-                          this->f4dout.data(), this->i_startblk, this->i_endblk,
-                          this->i_startidx_in, this->i_endidx_in, this->slev[0],
-                          this->elev[0], this->nproma, this->lacc, this->nlev,
-                          this->nblks_c, this->nblks_e);
+  div3d_2field<TypeParam>(
+      this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+      this->geofac_div.data(), this->div_vec_c.data(), this->f4din.data(),
+      this->f4dout.data(), this->i_startblk, this->i_endblk, this->i_startidx_in,
+      this->i_endidx_in, this->slev[0], this->elev[0], this->nproma, this->lacc,
+      this->nlev, this->nblks_c, this->nblks_e);
+
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(f4dout_h, this->f4dout);
 
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0);
@@ -353,33 +426,33 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         // Calculate reference value for first field
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
 
         // Calculate reference value for second field
         ref_f4dout[f4dout_at(jc, jk, jb, 0)] =
-            this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                               this->cell_edge_blk[cell_edge_at(jc, jb, 0)],
-                               0)] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                               this->cell_edge_blk[cell_edge_at(jc, jb, 1)],
-                               0)] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->f4din[f4d_at(this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                               this->cell_edge_blk[cell_edge_at(jc, jb, 2)],
-                               0)] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            f4din_h[f4d_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)], 0)] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            f4din_h[f4d_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)], 0)] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            f4din_h[f4d_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)], 0)] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
   }
@@ -387,7 +460,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   // Verify results for first field
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)],
+      EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
                   ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
           << "First field results differ at i=" << i << ", k=" << k;
     }
@@ -396,7 +469,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   // Verify results for second field
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->f4dout[f4dout_at(i, k, 0, 0)],
+      EXPECT_NEAR(f4dout_h[f4dout_at(i, k, 0, 0)],
                   ref_f4dout[f4dout_at(i, k, 0, 0)], 1e-5)
           << "Second field results differ at i=" << i << ", k=" << k;
     }
@@ -415,42 +488,59 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DSpecific) {
   const auto &f4din_at = at<nproma, nlev, nblks_e, dim4d>;
   const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>;
 
+  // Create mirror views to store data on the host
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto f4din_h = Kokkos::create_mirror_view(this->f4din);
+  auto f4dout_h = Kokkos::create_mirror_view(this->f4dout);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = (i + j) % nproma;
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
-      this->geofac_div[geofac_div_at(i, j, 0)] = 0.1 * (j + 1);
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = (i + j) % nproma;
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
+      geofac_div_h[geofac_div_at(i, j, 0)] = static_cast<TypeParam>(0.1 * (j + 1));
     }
 
     for (int k = 0; k < nlev; ++k) {
       for (int d = 0; d < dim4d; ++d) {
-        this->f4din[f4din_at(i, k, 0, d)] = 1.0 + i + k + d;
-        this->f4dout[f4dout_at(i, k, 0, d)] = 0.0;
+        f4din_h[f4din_at(i, k, 0, d)] = static_cast<TypeParam>(1.0 + i + k + d);
+        f4dout_h[f4dout_at(i, k, 0, d)] = static_cast<TypeParam>(0.0);
       }
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->f4din, f4din_h);
+  Kokkos::deep_copy(this->f4dout, f4dout_h);
+
   // Test function
-  div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-                   this->geofac_div.data(), this->f4din.data(),
-                   this->f4dout.data(), this->dim4d, this->i_startblk,
-                   this->i_endblk, this->i_startidx_in, this->i_endidx_in,
-                   this->slev.data(), this->elev.data(), this->nproma,
-                   this->lacc, this->nlev, this->nblks_c, this->nblks_e);
-
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 0)], 1.4, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 0)], 1.1, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 0)], 1.1, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 0)], 2.0, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 0, 0, 1)], 2.0, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 0, 0, 1)], 1.7, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 0, 0, 1)], 1.7, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(0, 1, 0, 1)], 2.6, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(1, 1, 0, 1)], 2.3, 1e-6);
-  EXPECT_NEAR(this->f4dout[f4dout_at(2, 1, 0, 1)], 2.3, 1e-6);
+  div4d<TypeParam>(
+    this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+    this->geofac_div.data(), this->f4din.data(), this->f4dout.data(),
+    this->dim4d, this->i_startblk, this->i_endblk, this->i_startidx_in,
+    this->i_endidx_in, this->slev.data(), this->elev.data(), this->nproma,
+    this->lacc, this->nlev, this->nblks_c, this->nblks_e);
+
+  // Copy results back to host for verification
+  Kokkos::deep_copy(f4dout_h, this->f4dout);
+
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(1.4), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(1.1), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(1.1), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(2.0), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 1)], static_cast<TypeParam>(2.0), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 1)], static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 1)], static_cast<TypeParam>(1.7), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 1)], static_cast<TypeParam>(2.6), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 1)], static_cast<TypeParam>(2.3), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 1)], static_cast<TypeParam>(2.3), 1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
@@ -465,6 +555,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   const auto &f4din_at = at<nproma, nlev, nblks_e, dim4d>;
   const auto &f4dout_at = at<nproma, nlev, nblks_c, dim4d>;
 
+  // Create mirror views to store data on the host
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto f4din_h = Kokkos::create_mirror_view(this->f4din);
+  auto f4dout_h = Kokkos::create_mirror_view(this->f4dout);
+
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -473,26 +570,36 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   // Initialize with random values
   for (int i = 0; i < nproma; ++i) {
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
-      this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen);
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
+      geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen);
     }
 
     for (int k = 0; k < nlev; ++k) {
       for (int d = 0; d < dim4d; ++d) {
-        this->f4din[f4din_at(i, k, 0, d)] = real_distrib(gen);
-        this->f4dout[f4dout_at(i, k, 0, d)] = 0.0;
+        f4din_h[f4din_at(i, k, 0, d)] = real_distrib(gen);
+        f4dout_h[f4dout_at(i, k, 0, d)] = static_cast<TypeParam>(0.0);
       }
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->f4din, f4din_h);
+  Kokkos::deep_copy(this->f4dout, f4dout_h);
+
   // Test function
-  div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-                   this->geofac_div.data(), this->f4din.data(),
-                   this->f4dout.data(), this->dim4d, this->i_startblk,
-                   this->i_endblk, this->i_startidx_in, this->i_endidx_in,
-                   this->slev.data(), this->elev.data(), this->nproma,
-                   this->lacc, this->nlev, this->nblks_c, this->nblks_e);
+  div4d<TypeParam>(
+    this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+    this->geofac_div.data(), this->f4din.data(), this->f4dout.data(),
+    this->dim4d, this->i_startblk, this->i_endblk, this->i_startidx_in,
+    this->i_endidx_in, this->slev.data(), this->elev.data(), this->nproma,
+    this->lacc, this->nlev, this->nblks_c, this->nblks_e);
+
+  // Copy results back to host for verification
+  Kokkos::deep_copy(f4dout_h, this->f4dout);
 
   // Compute reference result and check
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
@@ -506,13 +613,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
           TypeParam expected = 0.0;
           for (int je = 0; je < 3; ++je) {
             expected +=
-                this->f4din[f4din_at(
-                    this->cell_edge_idx[cell_edge_at(jc, jb, je)], jk,
-                    this->cell_edge_blk[cell_edge_at(jc, jb, je)], ji)] *
-                this->geofac_div[geofac_div_at(jc, je, jb)];
+                f4din_h[f4din_at(
+                    cell_edge_idx_h[cell_edge_at(jc, jb, je)], jk,
+                    cell_edge_blk_h[cell_edge_at(jc, jb, je)], ji)] *
+                geofac_div_h[geofac_div_at(jc, je, jb)];
           }
 
-          EXPECT_NEAR(this->f4dout[f4dout_at(jc, jk, jb, ji)], expected, 1e-5)
+          EXPECT_NEAR(f4dout_h[f4dout_at(jc, jk, jb, ji)], expected, 1e-5)
               << "Random test fails at jc=" << jc << ", jk=" << jk
               << ", jb=" << jb << ", ji=" << ji;
         }
@@ -521,8 +628,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   }
 }
 
-TYPED_TEST_SUITE(HorizontalDivTest, ValueTypes);
-
+/*
 TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   constexpr int nproma = this->nproma;
   constexpr int nlev = this->nlev;
@@ -1068,3 +1174,4 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
     }
   }
 }
+*/
-- 
GitLab


From c20359160d2664885501fa56fab9fb473fa77df6 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 25 Mar 2025 10:44:28 +0100
Subject: [PATCH 02/34] fixed the errors in gitlab-ci.yml

---
 .gitlab-ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index c12492b..5e0d58f 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -65,7 +65,7 @@ nvhpc_cpu:
   variables:
     SCHEDULER_PARAMETERS: "$SLURM_OPTIONS_CPU $SLURM_NTASKS"
   script:
-    - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/24.7-gcc-11.2.0
+    - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/22.5-gcc-11.2.0
     - mkdir nvhpc_cpu
     - cd nvhpc_cpu
     - export LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib64:$LD_LIBRARY_PATH
@@ -95,7 +95,7 @@ nvhpc_gpu:
     - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/24.7-gcc-11.2.0
     - mkdir nvhpc_gpu
     - cd nvhpc_gpu
-    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran -DIM_ENABLE_OPENACC=ON
+    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran -DIM_ENABLE_GPU=nvidia-sm80 -DIM_ENABLE_OPENACC=ON
     - make VERBOSE=1
     - make test
   tags:
@@ -112,7 +112,7 @@ OpenMP_gcc:
     - module load git gcc/11.2.0-gcc-11.2.0
     - mkdir openmp_gcc
     - cd openmp_gcc
-    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran -DIM_ENABLE_GPU=nvidia-sm80 -DIM_ENABLE_OPENMP=ON
+    - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=gcc -DCMAKE_Fortran_COMPILER=gfortran -DIM_ENABLE_OPENMP=ON
     - make VERBOSE=1
     - make test
   tags:
-- 
GitLab


From ff794e816f0e2b817fe154549d1caf990f0d6df1 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 25 Mar 2025 14:44:37 +0100
Subject: [PATCH 03/34] exported LD_LIBRARY_PATH for nvhpc_gpu

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 5e0d58f..6877a94 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -68,7 +68,6 @@ nvhpc_cpu:
     - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/22.5-gcc-11.2.0
     - mkdir nvhpc_cpu
     - cd nvhpc_cpu
-    - export LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib64:$LD_LIBRARY_PATH
     - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran
     - make VERBOSE=1
     - make test
@@ -95,6 +94,7 @@ nvhpc_gpu:
     - module load git gcc/11.2.0-gcc-11.2.0 nvhpc/24.7-gcc-11.2.0
     - mkdir nvhpc_gpu
     - cd nvhpc_gpu
+    - export LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib64:$LD_LIBRARY_PATH
     - /sw/spack-levante/cmake-3.23.1-q5kzz6/bin/cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_C_COMPILER=nvc -DCMAKE_Fortran_COMPILER=nvfortran -DIM_ENABLE_GPU=nvidia-sm80 -DIM_ENABLE_OPENACC=ON
     - make VERBOSE=1
     - make test
-- 
GitLab


From 34ea0a4daf9c4ba9d22a4d7f0d0a8ab81e209fc4 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 25 Mar 2025 18:31:31 +0100
Subject: [PATCH 04/34] corrected the view dimensions of two arrays

---
 src/horizontal/mo_lib_divrot.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp
index d086e8b..67d13fd 100644
--- a/src/horizontal/mo_lib_divrot.cpp
+++ b/src/horizontal/mo_lib_divrot.cpp
@@ -1093,8 +1093,8 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
   UnmanagedConstInt3D ieidx(cell_edge_idx, nproma, nblks_c, 3);
   UnmanagedConstInt3D ieblk(cell_edge_blk, nproma, nblks_c, 3);
 
-  UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 4, nblks_e);
-  UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, nlev, nblks_c);
+  UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 3, nblks_e);
+  UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, 4, nblks_c);
 
   UnmanagedT3D div_vec_c_view(div_vec_c, nproma, nlev, nblks_c);
 
-- 
GitLab


From 5a60cb4db63aabea0f999527464e17af7c182c44 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 25 Mar 2025 18:46:00 +0100
Subject: [PATCH 05/34] modified the rest of the tests in test_horizontal_div

---
 test/c/test_horizontal_div.cpp | 441 ++++++++++++++++++++-------------
 1 file changed, 273 insertions(+), 168 deletions(-)

diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index 2d06bc0..bc6ea9f 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -68,7 +68,7 @@ protected:
         f4dout("f4dout", dim_combine(nproma, nlev, nblks_c, dim4d)),
         cell_neighbor_idx("cell_neighbor_idx", dim_combine(nproma, nblks_c, 3)),
         cell_neighbor_blk("cell_neighbor_blk", dim_combine(nproma, nblks_c, 3)),
-        avg_coeff("avg_coeff", dim_combine(nproma, nlev, nblks_c)),
+        avg_coeff("avg_coeff", dim_combine(nproma, 4, nblks_c)),
         opt_in2("opt_in2", dim_combine(nproma, nlev, nblks_e)),
         opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c))
   {
@@ -628,7 +628,6 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   }
 }
 
-/*
 TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   constexpr int nproma = this->nproma;
   constexpr int nlev = this->nlev;
@@ -640,6 +639,8 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   const auto &cell_edge_at = at<nproma, nblks_c, 3>;
   const auto &geofac_div_at = at<nproma, 3, nblks_c>;
   const auto &div_vec_c_at = at<nproma, nlev, nblks_c>;
+  const auto &cell_neighbor_at = at<nproma, nblks_c, 3>;
+  const auto &avg_coeff_at = at<nproma, 4, nblks_c>;
 
   // Vectors for additional parameters
   // Vectors for block and index ranges
@@ -653,51 +654,71 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   bool l_limited_area = true;
   bool l2fields = true;
 
-  const auto &cell_neighbor_at = at<nproma, nblks_c, 3>;
-  const auto &avg_coeff_at = at<nproma, 4, nblks_c>;
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2);
+  auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2);
 
   // Initialize the vectors with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
-      this->opt_in2[vec_e_at(i, k, 0)] =
-          (i + 1) * (k + 1) * 0.5; // Half of vec_e
+      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      opt_in2_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
     }
 
     // Set edge indices to point to specific cells
-    this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i;
-    this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i;
+    cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
 
     // Set neighbor indices similarly
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = i;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = i;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma;
 
     // All edges and neighbors are in the same block for this test
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     // Geometric factors
-    this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5;
-    this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3;
-    this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2;
+    geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5);
+    geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3);
+    geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2);
 
     // Average coefficients
-    this->avg_coeff[avg_coeff_at(i, 0, 0)] = 0.4; // Self
-    this->avg_coeff[avg_coeff_at(i, 1, 0)] = 0.2; // First neighbor
-    this->avg_coeff[avg_coeff_at(i, 2, 0)] = 0.2; // Second neighbor
-    this->avg_coeff[avg_coeff_at(i, 3, 0)] = 0.2; // Third neighbor
+    avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self
+    avg_coeff_h[avg_coeff_at(i, 1, 0)] = static_cast<TypeParam>(0.2); // First neighbor
+    avg_coeff_h[avg_coeff_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Second neighbor
+    avg_coeff_h[avg_coeff_at(i, 3, 0)] = static_cast<TypeParam>(0.2); // Third neighbor
 
     // Initialize div_vec_c and opt_out2 to zero
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0;
-      this->opt_out2[div_vec_c_at(i, k, 0)] = 0.0;
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
+      opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->opt_in2, opt_in2_h);
+  Kokkos::deep_copy(this->opt_out2, opt_out2_h);
+
   // Call the div_avg function
   div_avg<TypeParam>(
       this->vec_e.data(), this->cell_neighbor_idx.data(),
@@ -709,19 +730,25 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
       this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev,
       this->nblks_c, this->nblks_e);
 
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.88, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.76, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.04, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.08, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.08, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.16, 1e-6);
-
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 0, 0)], 0.94, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 1, 0)], 1.88, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 0, 0)], 1.02, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 1, 0)], 2.04, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 0, 0)], 1.04, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 1, 0)], 2.08, 1e-6);
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(opt_out2_h, this->opt_out2);
+
+  // Verify first field results
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), 1e-6);
+
+  // Verify second field results
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.94), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(1.88), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(1.02), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(2.04), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(1.04), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(2.08), 1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
@@ -749,47 +776,69 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
   const auto &cell_neighbor_at = at<nproma, nblks_c, 3>;
   const auto &avg_coeff_at = at<nproma, 4, nblks_c>;
 
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2);
+  auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2);
+
   // Set up random number generators
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
 
   // Initialize with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
-      this->opt_in2[vec_e_at(i, k, 0)] = real_distrib(gen);
+      vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen);
+      opt_in2_h[vec_e_at(i, k, 0)] = real_distrib(gen);
     }
 
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
 
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
     for (int j = 0; j < 3; ++j) {
-      this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen);
+      geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen);
     }
 
     // Random average coefficients
     for (int j = 0; j < 4; ++j) {
-      this->avg_coeff[avg_coeff_at(i, j, 0)] = real_distrib(gen);
+      avg_coeff_h[avg_coeff_at(i, j, 0)] = real_distrib(gen);
     }
 
     // Random initial values for div_vec_c and opt_out2
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen);
-      this->opt_out2[div_vec_c_at(i, k, 0)] = real_distrib(gen);
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
+      opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->opt_in2, opt_in2_h);
+  Kokkos::deep_copy(this->opt_out2, opt_out2_h);
+
   // Call the div_avg function
   div_avg<TypeParam>(
       this->vec_e.data(), this->cell_neighbor_idx.data(),
@@ -801,6 +850,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
       this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev,
       this->nblks_c, this->nblks_e);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(opt_out2_h, this->opt_out2);
+
   // Calculate reference values manually
   std::vector<TypeParam> aux_c(dim_combine(nproma, nlev, nblks_c));
   std::vector<TypeParam> aux_c2(dim_combine(nproma, nlev, nblks_c));
@@ -816,32 +869,32 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
 
         aux_c2[div_vec_c_at(jc, jk, jb)] =
-            this->opt_in2[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->opt_in2[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->opt_in2[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            opt_in2_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            opt_in2_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            opt_in2_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
   }
@@ -872,35 +925,35 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)] *
-                this->avg_coeff[avg_coeff_at(jc, 0, jb)] +
+                avg_coeff_h[avg_coeff_at(jc, 0, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] *
-                this->avg_coeff[avg_coeff_at(jc, 1, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] *
+                avg_coeff_h[avg_coeff_at(jc, 1, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] *
-                this->avg_coeff[avg_coeff_at(jc, 2, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] *
+                avg_coeff_h[avg_coeff_at(jc, 2, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] *
-                this->avg_coeff[avg_coeff_at(jc, 3, jb)];
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] *
+                avg_coeff_h[avg_coeff_at(jc, 3, jb)];
 
         ref_opt_out2[div_vec_c_at(jc, jk, jb)] =
             aux_c2[div_vec_c_at(jc, jk, jb)] *
-                this->avg_coeff[avg_coeff_at(jc, 0, jb)] +
+                avg_coeff_h[avg_coeff_at(jc, 0, jb)] +
             aux_c2[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] *
-                this->avg_coeff[avg_coeff_at(jc, 1, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] *
+                avg_coeff_h[avg_coeff_at(jc, 1, jb)] +
             aux_c2[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] *
-                this->avg_coeff[avg_coeff_at(jc, 2, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] *
+                avg_coeff_h[avg_coeff_at(jc, 2, jb)] +
             aux_c2[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] *
-                this->avg_coeff[avg_coeff_at(jc, 3, jb)];
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] *
+                avg_coeff_h[avg_coeff_at(jc, 3, jb)];
       }
     }
   }
@@ -908,11 +961,11 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)],
+      EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
                   ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
           << "div_vec_c results differ at i=" << i << ", k=" << k;
 
-      EXPECT_NEAR(this->opt_out2[div_vec_c_at(i, k, 0)],
+      EXPECT_NEAR(opt_out2_h[div_vec_c_at(i, k, 0)],
                   ref_opt_out2[div_vec_c_at(i, k, 0)], 1e-5)
           << "opt_out2 results differ at i=" << i << ", k=" << k;
     }
@@ -945,48 +998,71 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
   const auto &cell_neighbor_at = at<nproma, nblks_c, 3>;
   const auto &avg_coeff_at = at<nproma, 4, nblks_c>;
 
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2);
+  auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2);
+
   // Initialize the vectors with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
-      this->opt_in2[vec_e_at(i, k, 0)] =
-          (i + 1) * (k + 1) * 0.5; // Half of vec_e
+      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      opt_in2_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
     }
 
     // Set edge indices to point to specific cells
-    this->cell_edge_idx[cell_edge_at(i, 0, 0)] = i;
-    this->cell_edge_idx[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_edge_idx[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 0)] = i;
+    cell_edge_idx_h[cell_edge_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_edge_idx_h[cell_edge_at(i, 0, 2)] = (i + 2) % nproma;
 
     // Set neighbor indices similarly
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = i;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = i;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = (i + 1) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = (i + 2) % nproma;
 
     // All edges and neighbors are in the same block for this test
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] = 0;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     // Geometric factors
-    this->geofac_div[geofac_div_at(i, 0, 0)] = 0.5;
-    this->geofac_div[geofac_div_at(i, 1, 0)] = 0.3;
-    this->geofac_div[geofac_div_at(i, 2, 0)] = 0.2;
+    geofac_div_h[geofac_div_at(i, 0, 0)] = static_cast<TypeParam>(0.5);
+    geofac_div_h[geofac_div_at(i, 1, 0)] = static_cast<TypeParam>(0.3);
+    geofac_div_h[geofac_div_at(i, 2, 0)] = static_cast<TypeParam>(0.2);
 
     // Average coefficients
-    this->avg_coeff[avg_coeff_at(i, 0, 0)] = 0.4; // Self
-    this->avg_coeff[avg_coeff_at(i, 1, 0)] = 0.2; // First neighbor
-    this->avg_coeff[avg_coeff_at(i, 2, 0)] = 0.2; // Second neighbor
-    this->avg_coeff[avg_coeff_at(i, 3, 0)] = 0.2; // Third neighbor
+    avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self
+    avg_coeff_h[avg_coeff_at(i, 1, 0)] = static_cast<TypeParam>(0.2); // First neighbor
+    avg_coeff_h[avg_coeff_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Second neighbor
+    avg_coeff_h[avg_coeff_at(i, 3, 0)] = static_cast<TypeParam>(0.2); // Third neighbor
 
     // Initialize div_vec_c and opt_out2 to zero
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = 0.0;
-      this->opt_out2[div_vec_c_at(i, k, 0)] = 0.0;
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
+      opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->opt_in2, opt_in2_h);
+  Kokkos::deep_copy(this->opt_out2, opt_out2_h);
+
   // Call the div_avg function
   div_avg<TypeParam>(
       this->vec_e.data(), this->cell_neighbor_idx.data(),
@@ -998,19 +1074,25 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
       this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev,
       this->nblks_c, this->nblks_e);
 
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 0, 0)], 1.88, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(0, 1, 0)], 3.76, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 0, 0)], 2.04, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(1, 1, 0)], 4.08, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 0, 0)], 2.08, 1e-6);
-  EXPECT_NEAR(this->div_vec_c[div_vec_c_at(2, 1, 0)], 4.16, 1e-6);
-
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 0, 0)], 0.0, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(0, 1, 0)], 0.0, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 0, 0)], 0.0, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(1, 1, 0)], 0.0, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 0, 0)], 0.0, 1e-6);
-  EXPECT_NEAR(this->opt_out2[div_vec_c_at(2, 1, 0)], 0.0, 1e-6);
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(opt_out2_h, this->opt_out2);
+
+  // Verify first field results
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), 1e-6);
+
+  // Since l2fields=false, opt_out2 should not be modified
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
@@ -1038,49 +1120,69 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
   const auto &cell_neighbor_at = at<nproma, nblks_c, 3>;
   const auto &avg_coeff_at = at<nproma, 4, nblks_c>;
 
+  // Create mirror views to store data on the host
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto div_vec_c_h = Kokkos::create_mirror_view(this->div_vec_c);
+  auto opt_in2_h = Kokkos::create_mirror_view(this->opt_in2);
+  auto opt_out2_h = Kokkos::create_mirror_view(this->opt_out2);
+
   // Set up random number generators
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
 
   // Initialize with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
-      this->opt_in2[vec_e_at(i, k, 0)] =
-          real_distrib(gen); // Not used but initialize anyway
+      vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen);
+      opt_in2_h[vec_e_at(i, k, 0)] = real_distrib(gen); // Not used but initialize anyway
     }
 
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
-      this->cell_edge_idx[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      this->cell_edge_blk[cell_edge_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
 
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] =
-          0; // Keep in same block for simplicity
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
     for (int j = 0; j < 3; ++j) {
-      this->geofac_div[geofac_div_at(i, j, 0)] = real_distrib(gen);
+      geofac_div_h[geofac_div_at(i, j, 0)] = real_distrib(gen);
     }
 
     // Random average coefficients
     for (int j = 0; j < 4; ++j) {
-      this->avg_coeff[avg_coeff_at(i, j, 0)] = real_distrib(gen);
+      avg_coeff_h[avg_coeff_at(i, j, 0)] = real_distrib(gen);
     }
 
     // Random initial values for div_vec_c and opt_out2
     for (int k = 0; k < nlev; ++k) {
-      this->div_vec_c[div_vec_c_at(i, k, 0)] = real_distrib(gen);
-      this->opt_out2[div_vec_c_at(i, k, 0)] =
-          real_distrib(gen); // Not used but initialize anyway
+      div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
+      opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); // Not used but initialize anyway
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_div, geofac_div_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
+  Kokkos::deep_copy(this->opt_in2, opt_in2_h);
+  Kokkos::deep_copy(this->opt_out2, opt_out2_h);
+
   // Call the div_avg function with l2fields=false
   div_avg<TypeParam>(
       this->vec_e.data(), this->cell_neighbor_idx.data(),
@@ -1092,6 +1194,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
       this->nproma, patch_id, l_limited_area, l2fields, this->lacc, this->nlev,
       this->nblks_c, this->nblks_e);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
+  Kokkos::deep_copy(opt_out2_h, this->opt_out2);
+
   // Calculate reference values manually
   std::vector<TypeParam> aux_c(dim_combine(nproma, nlev, nblks_c));
   std::vector<TypeParam> ref_div_vec_c(dim_combine(nproma, nlev, nblks_c));
@@ -1105,18 +1211,18 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 0)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 0)])] *
-                this->geofac_div[geofac_div_at(jc, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 1)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 1)])] *
-                this->geofac_div[geofac_div_at(jc, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->cell_edge_idx[cell_edge_at(jc, jb, 2)], jk,
-                this->cell_edge_blk[cell_edge_at(jc, jb, 2)])] *
-                this->geofac_div[geofac_div_at(jc, 2, jb)];
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+                geofac_div_h[geofac_div_at(jc, 0, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+                geofac_div_h[geofac_div_at(jc, 1, jb)] +
+            vec_e_h[vec_e_at(
+                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+                geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
   }
@@ -1147,19 +1253,19 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)] *
-                this->avg_coeff[avg_coeff_at(jc, 0, jb)] +
+                avg_coeff_h[avg_coeff_at(jc, 0, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 0)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 0)])] *
-                this->avg_coeff[avg_coeff_at(jc, 1, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 0)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 0)])] *
+                avg_coeff_h[avg_coeff_at(jc, 1, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 1)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 1)])] *
-                this->avg_coeff[avg_coeff_at(jc, 2, jb)] +
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 1)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 1)])] *
+                avg_coeff_h[avg_coeff_at(jc, 2, jb)] +
             aux_c[div_vec_c_at(
-                this->cell_neighbor_idx[cell_neighbor_at(jc, jb, 2)], jk,
-                this->cell_neighbor_blk[cell_neighbor_at(jc, jb, 2)])] *
-                this->avg_coeff[avg_coeff_at(jc, 3, jb)];
+                cell_neighbor_idx_h[cell_neighbor_at(jc, jb, 2)], jk,
+                cell_neighbor_blk_h[cell_neighbor_at(jc, jb, 2)])] *
+                avg_coeff_h[avg_coeff_at(jc, 3, jb)];
       }
     }
   }
@@ -1168,10 +1274,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
   // isn't updated
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->div_vec_c[div_vec_c_at(i, k, 0)],
+      EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
                   ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
           << "div_vec_c results differ at i=" << i << ", k=" << k;
     }
   }
 }
-*/
-- 
GitLab


From b43bd02f503163939a1e0c2fac085dc37a00b249 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Thu, 27 Mar 2025 11:57:42 +0100
Subject: [PATCH 06/34] modified the tests in horizontal_recon and made the lsq scratch arrays kernel-local in mo_lib_divrot

---
 src/horizontal/mo_lib_divrot.cpp |  714 ++++++++++-----------
 test/c/CMakeLists.txt            |    2 +-
 test/c/test_horizontal_recon.cpp | 1025 ++++++++++++++++++++----------
 3 files changed, 1042 insertions(+), 699 deletions(-)

diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp
index 67d13fd..d460211 100644
--- a/src/horizontal/mo_lib_divrot.cpp
+++ b/src/horizontal/mo_lib_divrot.cpp
@@ -36,9 +36,6 @@ void recon_lsq_cell_l(const T *p_cc, const int *cell_neighbor_idx,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T *> z_d("z_d", lsq_dim_c);
-  Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk);
-
   UnmanagedConstInt3D iidx(cell_neighbor_idx, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(cell_neighbor_blk, nproma, nblks_c, lsq_dim_c);
 
@@ -64,25 +61,28 @@ void recon_lsq_cell_l(const T *p_cc, const int *cell_neighbor_idx,
     Kokkos::parallel_for(
         "recon_lsq_cell_l_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_d(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+          T z_d[3]; // Local array instead of shared View
+          T z_qt_times_d[2];
+
+          z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
                    p_cc_view(jc, jk, jb);
-          z_d(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+          z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
                    p_cc_view(jc, jk, jb);
-          z_d(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+          z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
                    p_cc_view(jc, jk, jb);
           // matrix multiplication Q^T d (partitioned into 2 dot products)
-          z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0) +
-                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1) +
-                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2);
-          z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0) +
-                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1) +
-                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d(2);
+          z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2];
+          z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2];
 
           p_coeff_view(2, jc, jk, jb) =
-              lsq_rmat_rdiag_c_view(jc, 1, jb) * z_qt_times_d(1);
+              lsq_rmat_rdiag_c_view(jc, 1, jb) * z_qt_times_d[1];
           p_coeff_view(1, jc, jk, jb) =
               lsq_rmat_rdiag_c_view(jc, 0, jb) *
-              (z_qt_times_d(0) -
+              (z_qt_times_d[0] -
                lsq_rmat_utri_c_view(jc, 0, jb) * p_coeff_view(2, jc, jk, jb));
           p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb);
         });
@@ -124,8 +124,6 @@ void recon_lsq_cell_l_svd(const T *p_cc, const int *cell_neighbor_idx,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T *> z_b("z_b", lsq_dim_c);
-
   UnmanagedConstInt3D iidx(cell_neighbor_idx, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(cell_neighbor_blk, nproma, nblks_c, lsq_dim_c);
 
@@ -146,21 +144,22 @@ void recon_lsq_cell_l_svd(const T *p_cc, const int *cell_neighbor_idx,
     Kokkos::parallel_for(
         "recon_lsq_cell_l_svd_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_b(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+          T z_b[3]; // Local array instead of shared View
+          z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
                    p_cc_view(jc, jk, jb);
-          z_b(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+          z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
                    p_cc_view(jc, jk, jb);
-          z_b(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+          z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
                    p_cc_view(jc, jk, jb);
 
           p_coeff_view(2, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2);
+              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2];
           p_coeff_view(1, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2);
+              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2];
           p_coeff_view(0, jc, jk, jb) = p_cc_view(jc, jk, jb);
         });
     if (l_consv) {
@@ -201,8 +200,8 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, nlev);
-  Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk);
+  // Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, nlev);
+  // Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk);
 
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
@@ -239,93 +238,95 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
     Kokkos::parallel_for(
         "recon_lsq_cell_q_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_d(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
-                           p_cc_view(jc, jk, jb);
-          z_d(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
+          T z_d[9]; // Local array instead of shared View
+          T z_qt_times_d[5];
+          z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
+                   p_cc_view(jc, jk, jb);
+          z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-        });
-    Kokkos::parallel_for(
-        "recon_lsq_cell_q_step2", innerPolicy,
-        KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(2) = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(3) = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(4) = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 8, jb) * z_d(8, jc, jk);
-
-          p_coeff_view(5, jc, jk, jb) = ptr_rrdiag(jc, 4, jb) * z_qt_times_d(4);
+    //     });
+    // Kokkos::parallel_for(
+    //     "recon_lsq_cell_q_step2", innerPolicy,
+    //     KOKKOS_LAMBDA(const int jk, const int jc) {
+          z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 0, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 0, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 0, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 0, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 0, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 0, 8, jb) * z_d[8];
+          z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 1, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 1, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 1, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 1, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 1, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 1, 8, jb) * z_d[8];
+          z_qt_times_d[2] = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 2, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 2, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 2, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 2, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 2, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 2, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 2, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 2, 8, jb) * z_d[8];
+          z_qt_times_d[3] = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 3, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 3, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 3, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 3, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 3, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 3, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 3, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 3, 8, jb) * z_d[8];
+          z_qt_times_d[4] = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 4, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 4, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 4, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 4, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 4, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 4, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 4, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 4, 8, jb) * z_d[8];
+
+          p_coeff_view(5, jc, jk, jb) = ptr_rrdiag(jc, 4, jb) * z_qt_times_d[4];
           p_coeff_view(4, jc, jk, jb) =
               ptr_rrdiag(jc, 3, jb) *
-              (z_qt_times_d(3) -
+              (z_qt_times_d[3] -
                ptr_rutri(jc, 0, jb) * p_coeff_view(5, jc, jk, jb));
           p_coeff_view(3, jc, jk, jb) =
               ptr_rrdiag(jc, 2, jb) *
-              (z_qt_times_d(2) -
+              (z_qt_times_d[2] -
                ptr_rutri(jc, 1, jb) * p_coeff_view(4, jc, jk, jb) -
                ptr_rutri(jc, 2, jb) * p_coeff_view(5, jc, jk, jb));
           p_coeff_view(2, jc, jk, jb) =
               ptr_rrdiag(jc, 1, jb) *
-              (z_qt_times_d(1) -
+              (z_qt_times_d[1] -
                ptr_rutri(jc, 3, jb) * p_coeff_view(3, jc, jk, jb) -
                ptr_rutri(jc, 4, jb) * p_coeff_view(4, jc, jk, jb) -
                ptr_rutri(jc, 5, jb) * p_coeff_view(5, jc, jk, jb));
           p_coeff_view(1, jc, jk, jb) =
               ptr_rrdiag(jc, 0, jb) *
-              (z_qt_times_d(0) -
+              (z_qt_times_d[0] -
                ptr_rutri(jc, 6, jb) * p_coeff_view(2, jc, jk, jb) -
                ptr_rutri(jc, 7, jb) * p_coeff_view(3, jc, jk, jb) -
                ptr_rutri(jc, 8, jb) * p_coeff_view(4, jc, jk, jb) -
@@ -398,78 +399,79 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c,
     Kokkos::parallel_for(
         "recon_lsq_cell_q_svd_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_b(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+          T z_b[9]; // Local array instead of shared View
+          z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
                            p_cc_view(jc, jk, jb);
-          z_b(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+          z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
                            p_cc_view(jc, jk, jb);
-          z_b(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+          z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
                            p_cc_view(jc, jk, jb);
-          z_b(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
+          z_b[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
                            p_cc_view(jc, jk, jb);
-          z_b(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
+          z_b[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
                            p_cc_view(jc, jk, jb);
-          z_b(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
+          z_b[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
                            p_cc_view(jc, jk, jb);
-          z_b(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
+          z_b[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
                            p_cc_view(jc, jk, jb);
-          z_b(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
+          z_b[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
                            p_cc_view(jc, jk, jb);
-          z_b(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
+          z_b[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-        });
-    Kokkos::parallel_for(
-        "recon_lsq_cell_q_svd_step2", innerPolicy,
-        KOKKOS_LAMBDA(const int jk, const int jc) {
+        // });
+    // Kokkos::parallel_for(
+    //     "recon_lsq_cell_q_svd_step2", innerPolicy,
+    //     KOKKOS_LAMBDA(const int jk, const int jc) {
           p_coeff_view(5, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 4, 0, jb) * z_b(0, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 1, jb) * z_b(1, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 2, jb) * z_b(2, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 3, jb) * z_b(3, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 4, jb) * z_b(4, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 5, jb) * z_b(5, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 6, jb) * z_b(6, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 7, jb) * z_b(7, jc, jk) +
-              lsq_pseudoinv_view(jc, 4, 8, jb) * z_b(8, jc, jk);
+              lsq_pseudoinv_view(jc, 4, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 4, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 4, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 4, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 4, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 4, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 4, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 4, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 4, 8, jb) * z_b[8];
           p_coeff_view(4, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 3, 0, jb) * z_b(0, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 1, jb) * z_b(1, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 2, jb) * z_b(2, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 3, jb) * z_b(3, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 4, jb) * z_b(4, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 5, jb) * z_b(5, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 6, jb) * z_b(6, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 7, jb) * z_b(7, jc, jk) +
-              lsq_pseudoinv_view(jc, 3, 8, jb) * z_b(8, jc, jk);
+              lsq_pseudoinv_view(jc, 3, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 3, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 3, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 3, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 3, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 3, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 3, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 3, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 3, 8, jb) * z_b[8];
           p_coeff_view(3, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 2, 0, jb) * z_b(0, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 1, jb) * z_b(1, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 2, jb) * z_b(2, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 3, jb) * z_b(3, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 4, jb) * z_b(4, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 5, jb) * z_b(5, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 6, jb) * z_b(6, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 7, jb) * z_b(7, jc, jk) +
-              lsq_pseudoinv_view(jc, 2, 8, jb) * z_b(8, jc, jk);
+              lsq_pseudoinv_view(jc, 2, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 2, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 2, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 2, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 2, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 2, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 2, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 2, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 2, 8, jb) * z_b[8];
           p_coeff_view(2, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 3, jb) * z_b(3, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 4, jb) * z_b(4, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 5, jb) * z_b(5, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 6, jb) * z_b(6, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 7, jb) * z_b(7, jc, jk) +
-              lsq_pseudoinv_view(jc, 1, 8, jb) * z_b(8, jc, jk);
+              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 1, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 1, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 1, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 1, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 1, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 1, 8, jb) * z_b[8];
           p_coeff_view(1, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 3, jb) * z_b(3, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 4, jb) * z_b(4, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 5, jb) * z_b(5, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 6, jb) * z_b(6, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 7, jb) * z_b(7, jc, jk) +
-              lsq_pseudoinv_view(jc, 0, 8, jb) * z_b(8, jc, jk);
+              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 0, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 0, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 0, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 0, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 0, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 0, 8, jb) * z_b[8];
           p_coeff_view(0, jc, jk, jb) =
               p_cc_view(jc, jk, jb) -
               p_coeff_view(1, jc, jk, jb) * lsq_moments_view(jc, jb, 0) -
@@ -505,8 +507,8 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, elev);
-  Kokkos::View<T *> z_qt_times_d("z_qt_times_d", 9);
+  // Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, elev);
+  // Kokkos::View<T *> z_qt_times_d("z_qt_times_d", 9);
 
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
@@ -543,136 +545,139 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
     Kokkos::parallel_for(
         "recon_lsq_cell_c_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_d(0, jc, jk) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+          T z_d[9]; // Local array instead of shared View
+          T z_qt_times_d[9]; // Local array instead of shared View
+
+          z_d[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
                            p_cc_view(jc, jk, jb);
-          z_d(1, jc, jk) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+          z_d[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
                            p_cc_view(jc, jk, jb);
-          z_d(2, jc, jk) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+          z_d[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
                            p_cc_view(jc, jk, jb);
-          z_d(3, jc, jk) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
+          z_d[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
                            p_cc_view(jc, jk, jb);
-          z_d(4, jc, jk) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
+          z_d[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
                            p_cc_view(jc, jk, jb);
-          z_d(5, jc, jk) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
+          z_d[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
                            p_cc_view(jc, jk, jb);
-          z_d(6, jc, jk) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
+          z_d[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
                            p_cc_view(jc, jk, jb);
-          z_d(7, jc, jk) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
+          z_d[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
                            p_cc_view(jc, jk, jb);
-          z_d(8, jc, jk) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
+          z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-        });
-    Kokkos::parallel_for(
-        "recon_lsq_cell_c_step2", innerPolicy,
-        KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_qt_times_d(0) = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 0, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(1) = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 1, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(2) = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 2, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(3) = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 3, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(4) = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 4, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(5) = lsq_qtmat_c_view(jc, 5, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 5, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(6) = lsq_qtmat_c_view(jc, 6, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 6, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(7) = lsq_qtmat_c_view(jc, 7, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 7, 8, jb) * z_d(8, jc, jk);
-          z_qt_times_d(8) = lsq_qtmat_c_view(jc, 8, 0, jb) * z_d(0, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 1, jb) * z_d(1, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 2, jb) * z_d(2, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 3, jb) * z_d(3, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 4, jb) * z_d(4, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 5, jb) * z_d(5, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 6, jb) * z_d(6, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 7, jb) * z_d(7, jc, jk) +
-                            lsq_qtmat_c_view(jc, 8, 8, jb) * z_d(8, jc, jk);
-
-          p_coeff_view(9, jc, jk, jb) = ptr_rrdiag(jc, 8, jb) * z_qt_times_d(8);
+    //     });
+    // Kokkos::parallel_for(
+    //     "recon_lsq_cell_c_step2", innerPolicy,
+    //     KOKKOS_LAMBDA(const int jk, const int jc) {
+          z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 0, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 0, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 0, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 0, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 0, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 0, 8, jb) * z_d[8];
+          z_qt_times_d[1] = lsq_qtmat_c_view(jc, 1, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 1, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 1, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 1, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 1, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 1, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 1, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 1, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 1, 8, jb) * z_d[8];
+          z_qt_times_d[2] = lsq_qtmat_c_view(jc, 2, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 2, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 2, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 2, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 2, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 2, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 2, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 2, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 2, 8, jb) * z_d[8];
+          z_qt_times_d[3] = lsq_qtmat_c_view(jc, 3, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 3, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 3, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 3, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 3, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 3, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 3, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 3, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 3, 8, jb) * z_d[8];
+          z_qt_times_d[4] = lsq_qtmat_c_view(jc, 4, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 4, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 4, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 4, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 4, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 4, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 4, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 4, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 4, 8, jb) * z_d[8];
+          z_qt_times_d[5] = lsq_qtmat_c_view(jc, 5, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 5, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 5, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 5, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 5, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 5, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 5, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 5, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 5, 8, jb) * z_d[8];
+          z_qt_times_d[6] = lsq_qtmat_c_view(jc, 6, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 6, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 6, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 6, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 6, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 6, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 6, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 6, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 6, 8, jb) * z_d[8];
+          z_qt_times_d[7] = lsq_qtmat_c_view(jc, 7, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 7, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 7, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 7, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 7, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 7, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 7, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 7, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 7, 8, jb) * z_d[8];
+          z_qt_times_d[8] = lsq_qtmat_c_view(jc, 8, 0, jb) * z_d[0] +
+                            lsq_qtmat_c_view(jc, 8, 1, jb) * z_d[1] +
+                            lsq_qtmat_c_view(jc, 8, 2, jb) * z_d[2] +
+                            lsq_qtmat_c_view(jc, 8, 3, jb) * z_d[3] +
+                            lsq_qtmat_c_view(jc, 8, 4, jb) * z_d[4] +
+                            lsq_qtmat_c_view(jc, 8, 5, jb) * z_d[5] +
+                            lsq_qtmat_c_view(jc, 8, 6, jb) * z_d[6] +
+                            lsq_qtmat_c_view(jc, 8, 7, jb) * z_d[7] +
+                            lsq_qtmat_c_view(jc, 8, 8, jb) * z_d[8];
+
+          p_coeff_view(9, jc, jk, jb) = ptr_rrdiag(jc, 8, jb) * z_qt_times_d[8];
           p_coeff_view(8, jc, jk, jb) =
               ptr_rrdiag(jc, 7, jb) *
-              (z_qt_times_d(7) -
+              (z_qt_times_d[7] -
                ptr_rutri(jc, 0, jb) * p_coeff_view(9, jc, jk, jb));
           p_coeff_view(7, jc, jk, jb) =
               ptr_rrdiag(jc, 6, jb) *
-              (z_qt_times_d(6) -
+              (z_qt_times_d[6] -
                (ptr_rutri(jc, 1, jb) * p_coeff_view(8, jc, jk, jb) +
                 ptr_rutri(jc, 2, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(6, jc, jk, jb) =
               ptr_rrdiag(jc, 5, jb) *
-              (z_qt_times_d(5) -
+              (z_qt_times_d[5] -
                (ptr_rutri(jc, 3, jb) * p_coeff_view(7, jc, jk, jb) +
                 ptr_rutri(jc, 4, jb) * p_coeff_view(8, jc, jk, jb) +
                 ptr_rutri(jc, 5, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(5, jc, jk, jb) =
               ptr_rrdiag(jc, 4, jb) *
-              (z_qt_times_d(4) -
+              (z_qt_times_d[4] -
                (ptr_rutri(jc, 6, jb) * p_coeff_view(6, jc, jk, jb) +
                 ptr_rutri(jc, 7, jb) * p_coeff_view(7, jc, jk, jb) +
                 ptr_rutri(jc, 8, jb) * p_coeff_view(8, jc, jk, jb) +
                 ptr_rutri(jc, 9, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(4, jc, jk, jb) =
               ptr_rrdiag(jc, 3, jb) *
-              (z_qt_times_d(3) -
+              (z_qt_times_d[3] -
                (ptr_rutri(jc, 10, jb) * p_coeff_view(5, jc, jk, jb) +
                 ptr_rutri(jc, 11, jb) * p_coeff_view(6, jc, jk, jb) +
                 ptr_rutri(jc, 12, jb) * p_coeff_view(7, jc, jk, jb) +
@@ -680,7 +685,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                 ptr_rutri(jc, 14, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(3, jc, jk, jb) =
               ptr_rrdiag(jc, 2, jb) *
-              (z_qt_times_d(2) -
+              (z_qt_times_d[2] -
                (ptr_rutri(jc, 15, jb) * p_coeff_view(4, jc, jk, jb) +
                 ptr_rutri(jc, 16, jb) * p_coeff_view(5, jc, jk, jb) +
                 ptr_rutri(jc, 17, jb) * p_coeff_view(6, jc, jk, jb) +
@@ -689,7 +694,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                 ptr_rutri(jc, 20, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(2, jc, jk, jb) =
               ptr_rrdiag(jc, 1, jb) *
-              (z_qt_times_d(1) -
+              (z_qt_times_d[1] -
                (ptr_rutri(jc, 21, jb) * p_coeff_view(3, jc, jk, jb) +
                 ptr_rutri(jc, 22, jb) * p_coeff_view(4, jc, jk, jb) +
                 ptr_rutri(jc, 23, jb) * p_coeff_view(5, jc, jk, jb) +
@@ -699,7 +704,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                 ptr_rutri(jc, 27, jb) * p_coeff_view(9, jc, jk, jb)));
           p_coeff_view(1, jc, jk, jb) =
               ptr_rrdiag(jc, 0, jb) *
-              (z_qt_times_d(0) -
+              (z_qt_times_d[0] -
                (ptr_rutri(jc, 28, jb) * p_coeff_view(2, jc, jk, jb) +
                 ptr_rutri(jc, 29, jb) * p_coeff_view(3, jc, jk, jb) +
                 ptr_rutri(jc, 30, jb) * p_coeff_view(4, jc, jk, jb) +
@@ -748,7 +753,7 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T *> z_b("z_b", 9);
+  // Kokkos::View<T *> z_b("z_b", 9);
 
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
@@ -786,115 +791,116 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c,
     Kokkos::parallel_for(
         "recon_lsq_cell_c_svd_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
-          z_b(0) = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
+          T z_b[9]; // Local array instead of shared View
+          z_b[0] = p_cc_view(iidx(jc, jb, 0), jk, iblk(jc, jb, 0)) -
                    p_cc_view(jc, jk, jb);
-          z_b(1) = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
+          z_b[1] = p_cc_view(iidx(jc, jb, 1), jk, iblk(jc, jb, 1)) -
                    p_cc_view(jc, jk, jb);
-          z_b(2) = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
+          z_b[2] = p_cc_view(iidx(jc, jb, 2), jk, iblk(jc, jb, 2)) -
                    p_cc_view(jc, jk, jb);
-          z_b(3) = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
+          z_b[3] = p_cc_view(iidx(jc, jb, 3), jk, iblk(jc, jb, 3)) -
                    p_cc_view(jc, jk, jb);
-          z_b(4) = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
+          z_b[4] = p_cc_view(iidx(jc, jb, 4), jk, iblk(jc, jb, 4)) -
                    p_cc_view(jc, jk, jb);
-          z_b(5) = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
+          z_b[5] = p_cc_view(iidx(jc, jb, 5), jk, iblk(jc, jb, 5)) -
                    p_cc_view(jc, jk, jb);
-          z_b(6) = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
+          z_b[6] = p_cc_view(iidx(jc, jb, 6), jk, iblk(jc, jb, 6)) -
                    p_cc_view(jc, jk, jb);
-          z_b(7) = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
+          z_b[7] = p_cc_view(iidx(jc, jb, 7), jk, iblk(jc, jb, 7)) -
                    p_cc_view(jc, jk, jb);
-          z_b(8) = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
+          z_b[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                    p_cc_view(jc, jk, jb);
 
           p_coeff_view(9, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 8, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 8, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 8, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 8, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 8, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 8, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 8, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 8, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 8, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 8, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 8, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 8, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 8, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 8, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 8, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 8, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 8, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 8, 8, jb) * z_b[8];
           p_coeff_view(8, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 7, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 7, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 7, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 7, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 7, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 7, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 7, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 7, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 7, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 7, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 7, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 7, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 7, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 7, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 7, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 7, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 7, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 7, 8, jb) * z_b[8];
           p_coeff_view(7, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 6, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 6, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 6, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 6, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 6, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 6, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 6, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 6, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 6, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 6, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 6, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 6, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 6, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 6, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 6, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 6, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 6, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 6, 8, jb) * z_b[8];
           p_coeff_view(6, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 5, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 5, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 5, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 5, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 5, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 5, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 5, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 5, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 5, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 5, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 5, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 5, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 5, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 5, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 5, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 5, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 5, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 5, 8, jb) * z_b[8];
           p_coeff_view(5, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 4, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 4, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 4, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 4, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 4, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 4, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 4, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 4, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 4, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 4, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 4, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 4, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 4, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 4, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 4, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 4, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 4, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 4, 8, jb) * z_b[8];
           p_coeff_view(4, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 3, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 3, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 3, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 3, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 3, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 3, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 3, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 3, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 3, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 3, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 3, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 3, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 3, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 3, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 3, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 3, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 3, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 3, 8, jb) * z_b[8];
           p_coeff_view(3, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 2, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 2, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 2, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 2, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 2, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 2, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 2, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 2, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 2, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 2, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 2, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 2, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 2, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 2, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 2, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 2, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 2, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 2, 8, jb) * z_b[8];
           p_coeff_view(2, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 1, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 1, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 1, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 1, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 1, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 1, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 1, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 1, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 1, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 1, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 1, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 1, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 1, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 1, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 1, 8, jb) * z_b[8];
           p_coeff_view(1, jc, jk, jb) =
-              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b(0) +
-              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b(1) +
-              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b(2) +
-              lsq_pseudoinv_view(jc, 0, 3, jb) * z_b(3) +
-              lsq_pseudoinv_view(jc, 0, 4, jb) * z_b(4) +
-              lsq_pseudoinv_view(jc, 0, 5, jb) * z_b(5) +
-              lsq_pseudoinv_view(jc, 0, 6, jb) * z_b(6) +
-              lsq_pseudoinv_view(jc, 0, 7, jb) * z_b(7) +
-              lsq_pseudoinv_view(jc, 0, 8, jb) * z_b(8);
+              lsq_pseudoinv_view(jc, 0, 0, jb) * z_b[0] +
+              lsq_pseudoinv_view(jc, 0, 1, jb) * z_b[1] +
+              lsq_pseudoinv_view(jc, 0, 2, jb) * z_b[2] +
+              lsq_pseudoinv_view(jc, 0, 3, jb) * z_b[3] +
+              lsq_pseudoinv_view(jc, 0, 4, jb) * z_b[4] +
+              lsq_pseudoinv_view(jc, 0, 5, jb) * z_b[5] +
+              lsq_pseudoinv_view(jc, 0, 6, jb) * z_b[6] +
+              lsq_pseudoinv_view(jc, 0, 7, jb) * z_b[7] +
+              lsq_pseudoinv_view(jc, 0, 8, jb) * z_b[8];
           p_coeff_view(0, jc, jk, jb) =
               p_cc_view(jc, jk, jb) -
               p_coeff_view(1, jc, jk, jb) * lsq_moments_view(jc, jb, 0) -
diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index 0b42439..f4c5e27 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -33,8 +33,8 @@ endif()
 set(SOURCES
   main.cpp
   test_horizontal_div.cpp
-  # test_horizontal_recon.cpp
   # test_horizontal_rot.cpp
+  test_horizontal_recon.cpp
   # test_tdma_solver.cpp
   # test_interpolation_vector.cpp
   # test_intp_rbf.cpp
diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index 8938a10..089c58f 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -14,8 +14,8 @@
 #include <vector>
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
 #include <dim_helper.hpp>
+#include <gtest/gtest.h>
 #include <horizontal/mo_lib_divrot.hpp>
 #include <support/mo_lib_loopindices.hpp>
 
@@ -26,8 +26,8 @@ enum class ReconstructionMethod {
   cubic,
 };
 
-/// Base test class for the horizontal reconstruct tests. Templated for the ValueType
-/// and ReconMethod for the reconstruction method.
+/// Base test class for the horizontal reconstruction tests. Templated on the
+/// ValueType and on ReconMethod, which selects the reconstruction method.
 template <typename ValueType, int ReconMethod>
 class HorizontalReconTest : public ::testing::Test {
 protected:
@@ -66,28 +66,40 @@ protected:
   bool l_consv = true;        // With conservative correction.
   bool l_limited_area = true; // Limited area setup
 
-  std::vector<ValueType> p_cc;
-  std::vector<int> cell_neighbor_idx;
-  std::vector<int> cell_neighbor_blk;
-  std::vector<ValueType> lsq_qtmat_c;
-  std::vector<ValueType> lsq_rmat_rdiag_c;
-  std::vector<ValueType> lsq_rmat_utri_c;
-  std::vector<ValueType> lsq_moments;
-  std::vector<ValueType> lsq_pseudoinv;
-  std::vector<ValueType> p_coeff;
-
-  HorizontalReconTest() {
-    p_cc.resize(dim_combine(nproma, nlev, nblks_c));
-    cell_neighbor_idx.resize(dim_combine(nproma, nblks_c, lsq_dim_c));
-    cell_neighbor_blk.resize(dim_combine(nproma, nblks_c, lsq_dim_c));
-    lsq_qtmat_c.resize(dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c));
-    lsq_rmat_rdiag_c.resize(dim_combine(nproma, lsq_dim_unk, nblks_c));
-    lsq_rmat_utri_c.resize(dim_combine(
-        nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c));
-    lsq_moments.resize(dim_combine(nproma, nblks_c, lsq_dim_unk));
-    lsq_pseudoinv.resize(dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c));
-    p_coeff.resize(dim_combine(lsq_dim_unk + 1, nproma, nlev, nblks_c));
-  }
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
+  // Kokkos Views for test data
+  Kokkos::View<ValueType *, memory_space> p_cc;
+  Kokkos::View<int *, memory_space> cell_neighbor_idx;
+  Kokkos::View<int *, memory_space> cell_neighbor_blk;
+  Kokkos::View<ValueType *, memory_space> lsq_qtmat_c;
+  Kokkos::View<ValueType *, memory_space> lsq_rmat_rdiag_c;
+  Kokkos::View<ValueType *, memory_space> lsq_rmat_utri_c;
+  Kokkos::View<ValueType *, memory_space> lsq_moments;
+  Kokkos::View<ValueType *, memory_space> lsq_pseudoinv;
+  Kokkos::View<ValueType *, memory_space> p_coeff;
+
+  HorizontalReconTest()
+      : p_cc("p_cc", dim_combine(nproma, nlev, nblks_c)),
+        cell_neighbor_idx("cell_neighbor_idx",
+                          dim_combine(nproma, nblks_c, lsq_dim_c)),
+        cell_neighbor_blk("cell_neighbor_blk",
+                          dim_combine(nproma, nblks_c, lsq_dim_c)),
+        lsq_qtmat_c("lsq_qtmat_c",
+                    dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)),
+        lsq_rmat_rdiag_c("lsq_rmat_rdiag_c",
+                         dim_combine(nproma, lsq_dim_unk, nblks_c)),
+        lsq_rmat_utri_c(
+            "lsq_rmat_utri_c",
+            dim_combine(nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2,
+                        nblks_c)),
+        lsq_moments("lsq_moments", dim_combine(nproma, nblks_c, lsq_dim_unk)),
+        lsq_pseudoinv("lsq_pseudoinv",
+                      dim_combine(nproma, lsq_dim_unk, lsq_dim_c, nblks_c)),
+        p_coeff("p_coeff",
+                dim_combine(lsq_dim_unk + 1, nproma, nlev, nblks_c)) {}
 };
 
 /// Test class for the horizontal tests. The reconstruction method is specified
@@ -134,30 +146,54 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCell) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
+
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = i;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = i;
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = i;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = i;
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
-      this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0;
-      this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.5;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
+      lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5);
     }
+
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 0, 0)] = 2.0;
-    this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 1, 0)] = 2.0;
-    this->lsq_rmat_utri_c[rmat_utri_at(i, 0, 0)] = 0.1;
+    lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 0, 0)] = static_cast<TypeParam>(2.0);
+    lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 1, 0)] = static_cast<TypeParam>(2.0);
+    lsq_rmat_utri_c_h[rmat_utri_at(i, 0, 0)] = static_cast<TypeParam>(0.1);
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_l<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -168,16 +204,19 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCell) {
       this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async,
       this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      0.34, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(0.34), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      1.8, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(1.8), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      1.0, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(1.0), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
@@ -196,6 +235,18 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -203,26 +254,37 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
-      this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = real_distrib(gen);
-      this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = real_distrib(gen);
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
+      lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = real_distrib(gen);
+      lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = real_distrib(gen);
     }
+
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 0, 0)] = real_distrib(gen);
-    this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, 1, 0)] = real_distrib(gen);
-    this->lsq_rmat_utri_c[rmat_utri_at(i, 0, 0)] = real_distrib(gen);
+    lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 0, 0)] = real_distrib(gen);
+    lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, 1, 0)] = real_distrib(gen);
+    lsq_rmat_utri_c_h[rmat_utri_at(i, 0, 0)] = real_distrib(gen);
 
-    this->lsq_moments[moments_at(i, 0, 0)] = real_distrib(gen);
-    this->lsq_moments[moments_at(i, 0, 1)] = real_distrib(gen);
+    lsq_moments_h[moments_at(i, 0, 0)] = real_distrib(gen);
+    lsq_moments_h[moments_at(i, 0, 1)] = real_distrib(gen);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_l<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -233,10 +295,14 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
       this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async,
       this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
+
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
@@ -244,30 +310,30 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
         z_qt_times_d[0] = 0.0;
         z_qt_times_d[1] = 0.0;
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_qt_times_d[0] += this->lsq_qtmat_c[qtmat_at(jc, 0, i, jb)] * z_d[i];
-          z_qt_times_d[1] += this->lsq_qtmat_c[qtmat_at(jc, 1, i, jb)] * z_d[i];
+          z_qt_times_d[0] += lsq_qtmat_c_h[qtmat_at(jc, 0, i, jb)] * z_d[i];
+          z_qt_times_d[1] += lsq_qtmat_c_h[qtmat_at(jc, 1, i, jb)] * z_d[i];
         }
         p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] =
-            this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, 1, jb)] * z_qt_times_d[1];
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 1, jb)] * z_qt_times_d[1];
         p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] =
-            this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, 0, jb)] *
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 0, jb)] *
             (z_qt_times_d[0] -
-             this->lsq_rmat_utri_c[rmat_utri_at(jc, 0, jb)] *
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] *
                  p_result[at<lsq_dim_unk + 1, nproma>(2, jc)]);
         p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-            this->p_cc[p_cc_at(jc, jk, jb)] -
+            p_cc_h[p_cc_at(jc, jk, jb)] -
             p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] *
-                this->lsq_moments[moments_at(jc, jb, 0)] -
+                lsq_moments_h[moments_at(jc, jb, 0)] -
             p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] *
-                this->lsq_moments[moments_at(jc, jb, 1)];
+                lsq_moments_h[moments_at(jc, jb, 1)];
       }
     }
   }
@@ -275,7 +341,7 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))],
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
                   p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
@@ -295,26 +361,46 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVD) {
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
+
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 1)] = i;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 2)] = i;
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 1)] = i;
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 2)] = i;
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
-      this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0;
-      this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.5;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
+      lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5);
     }
+
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_l_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -324,16 +410,19 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVD) {
       this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async,
       this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      0.65, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(0.65), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      1.0, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(1.0), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      0.5, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(0.5), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
@@ -349,6 +438,16 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -356,22 +455,31 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
-      this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = real_distrib(gen);
-      this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = real_distrib(gen);
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
+      lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = real_distrib(gen);
+      lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = real_distrib(gen);
     }
+
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = real_distrib(gen);
-    this->lsq_moments[moments_at(i, 0, 1)] = real_distrib(gen);
+    lsq_moments_h[moments_at(i, 0, 0)] = real_distrib(gen);
+    lsq_moments_h[moments_at(i, 0, 1)] = real_distrib(gen);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_l_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -381,6 +489,9 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
       this->elev, this->nproma, this->l_consv, this->lacc, this->acc_async,
       this->nblks_c, this->nlev, this->lsq_dim_unk, this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
@@ -391,26 +502,26 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
         p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] =
-            this->lsq_pseudoinv[pseudoinv_at(jc, 1, 0, jb)] * z_d[0] +
-            this->lsq_pseudoinv[pseudoinv_at(jc, 1, 1, jb)] * z_d[1] +
-            this->lsq_pseudoinv[pseudoinv_at(jc, 1, 2, jb)] * z_d[2];
+            lsq_pseudoinv_h[pseudoinv_at(jc, 1, 0, jb)] * z_d[0] +
+            lsq_pseudoinv_h[pseudoinv_at(jc, 1, 1, jb)] * z_d[1] +
+            lsq_pseudoinv_h[pseudoinv_at(jc, 1, 2, jb)] * z_d[2];
         p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] =
-            this->lsq_pseudoinv[pseudoinv_at(jc, 0, 0, jb)] * z_d[0] +
-            this->lsq_pseudoinv[pseudoinv_at(jc, 0, 1, jb)] * z_d[1] +
-            this->lsq_pseudoinv[pseudoinv_at(jc, 0, 2, jb)] * z_d[2];
+            lsq_pseudoinv_h[pseudoinv_at(jc, 0, 0, jb)] * z_d[0] +
+            lsq_pseudoinv_h[pseudoinv_at(jc, 0, 1, jb)] * z_d[1] +
+            lsq_pseudoinv_h[pseudoinv_at(jc, 0, 2, jb)] * z_d[2];
         p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
             p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-                this->p_cc[p_cc_at(jc, jk, jb)] -
+                p_cc_h[p_cc_at(jc, jk, jb)] -
                 p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] *
-                    this->lsq_moments[moments_at(jc, jb, 0)] -
+                    lsq_moments_h[moments_at(jc, jb, 0)] -
                 p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] *
-                    this->lsq_moments[moments_at(jc, jb, 1)];
+                    lsq_moments_h[moments_at(jc, jb, 1)];
       }
     }
   }
@@ -418,7 +529,7 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))],
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
                   p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
@@ -443,43 +554,65 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCell) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0;
     for (int j = 1; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0;
-      this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.5;
-      this->lsq_qtmat_c[qtmat_at(i, 2, j, 0)] = 0.2;
-      this->lsq_qtmat_c[qtmat_at(i, 3, j, 0)] = 0.7;
-      this->lsq_qtmat_c[qtmat_at(i, 4, j, 0)] = 1.3;
+      lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5);
+      lsq_qtmat_c_h[qtmat_at(i, 2, j, 0)] = static_cast<TypeParam>(0.2);
+      lsq_qtmat_c_h[qtmat_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7);
+      lsq_qtmat_c_h[qtmat_at(i, 4, j, 0)] = static_cast<TypeParam>(1.3);
     }
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
-      this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = 2.0;
+      lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = static_cast<TypeParam>(2.0);
     }
 
     for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) {
-      this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = 1.0;
+      lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = static_cast<TypeParam>(1.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
-    this->lsq_moments[moments_at(i, 0, 2)] = 0.4;
-    this->lsq_moments[moments_at(i, 0, 3)] = 0.5;
-    this->lsq_moments[moments_at(i, 0, 4)] = 0.6;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
+    lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4);
+    lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5);
+    lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_q<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -491,25 +624,28 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCell) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      0.24, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(0.24), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      3.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(3.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      -2.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(-2.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
-      2.8, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
+      static_cast<TypeParam>(2.8), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
-      -3.8, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
+      static_cast<TypeParam>(-3.8), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
-      2.6, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
+      static_cast<TypeParam>(2.6), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
@@ -528,6 +664,19 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
+  // Seed the generator from std::random_device (seed is not fixed)
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -535,29 +684,39 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
       for (int k = 0; k < lsq_dim_c; ++k) {
-        this->lsq_qtmat_c[qtmat_at(i, j, k, 0)] = real_distrib(gen);
+        lsq_qtmat_c_h[qtmat_at(i, j, k, 0)] = real_distrib(gen);
       }
-      this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = real_distrib(gen);
-      this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen);
+      lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = real_distrib(gen);
+      lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen);
     }
     for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) {
-      this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = real_distrib(gen);
+      lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = real_distrib(gen);
     }
 
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_q<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -569,46 +728,89 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
-  // Compute reference result
-  std::vector<TypeParam> z_d(lsq_dim_c);
-  std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
-  std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
+  // Create host views for reference computation
+  using host_space = Kokkos::HostSpace;
+
+  // Arrays for intermediate calculations
+  Kokkos::View<TypeParam ***, host_space> z_d_h("z_d_h", lsq_dim_c, nproma,
+                                                nlev);
+  Kokkos::View<TypeParam *, host_space> z_qt_times_d_h("z_qt_times_d_h",
+                                                       lsq_dim_unk);
+
+  // Result view
+  Kokkos::View<TypeParam **, host_space> p_result_h("p_result_h",
+                                                    lsq_dim_unk + 1, nproma);
+
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+
+    // Step 1: Calculate z_d values (matches the "recon_lsq_cell_q_step1"
+    // parallel_for)
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d_h(i, jc, jk) =
+              p_cc_h[p_cc_at(
+                  cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                  cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+              p_cc_h[p_cc_at(jc, jk, jb)];
         }
+      }
+    }
+
+    // Step 2: Calculate coefficients (matches the "recon_lsq_cell_q_step2"
+    // parallel_for)
+    for (int jk = this->slev; jk < this->elev; ++jk) {
+      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+        // Matrix multiplication (Q^T * d)
         for (int j = 0; j < lsq_dim_unk; ++j) {
-          z_qt_times_d[j] = 0.0;
+          z_qt_times_d_h(j) = 0.0;
           for (int i = 0; i < lsq_dim_c; ++i) {
-            z_qt_times_d[j] +=
-                this->lsq_qtmat_c[qtmat_at(jc, j, i, jb)] * z_d[i];
+            z_qt_times_d_h(j) +=
+                lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d_h(i, jc, jk);
           }
         }
-        int utri_id = 0;
-        for (int j = lsq_dim_unk; j > 0; --j) {
-          p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = z_qt_times_d[j - 1];
-          for (int k = j + 1; k <= lsq_dim_unk; ++k) {
-            p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] -=
-                this->lsq_rmat_utri_c[rmat_utri_at(jc, utri_id++, jb)] *
-                p_result[at<lsq_dim_unk + 1, nproma>(k, jc)];
-          }
-          p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] *=
-              this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, j - 1, jb)];
-        }
-        p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-            this->p_cc[p_cc_at(jc, jk, jb)];
+
+        // Back-substitution (mirrors the order in the GPU implementation)
+        p_result_h(5, jc) =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 4, jb)] * z_qt_times_d_h(4);
+
+        p_result_h(4, jc) =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 3, jb)] *
+            (z_qt_times_d_h(3) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] * p_result_h(5, jc));
+
+        p_result_h(3, jc) =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 2, jb)] *
+            (z_qt_times_d_h(2) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 1, jb)] * p_result_h(4, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 2, jb)] * p_result_h(5, jc));
+
+        p_result_h(2, jc) =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 1, jb)] *
+            (z_qt_times_d_h(1) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 3, jb)] * p_result_h(3, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 4, jb)] * p_result_h(4, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 5, jb)] * p_result_h(5, jc));
+
+        p_result_h(1, jc) =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 0, jb)] *
+            (z_qt_times_d_h(0) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 6, jb)] * p_result_h(2, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 7, jb)] * p_result_h(3, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 8, jb)] * p_result_h(4, jc) -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 9, jb)] * p_result_h(5, jc));
+
+        // Conservation correction
+        p_result_h(0, jc) = p_cc_h[p_cc_at(jc, jk, jb)];
         for (int j = 0; j < lsq_dim_unk; ++j) {
-          p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -=
-              p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] *
-              this->lsq_moments[moments_at(jc, jb, j)];
+          p_result_h(0, jc) -=
+              p_result_h(j + 1, jc) * lsq_moments_h[moments_at(jc, jb, j)];
         }
       }
     }
@@ -617,8 +819,7 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
+      EXPECT_NEAR(p_coeff_h[p_coeff_at(i, jc, 0, 0)], p_result_h(i, jc), 1e-5)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
@@ -637,35 +838,53 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVD) {
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0;
     for (int j = 1; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0;
-      this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.5;
-      this->lsq_pseudoinv[pseudoinv_at(i, 2, j, 0)] = 0.2;
-      this->lsq_pseudoinv[pseudoinv_at(i, 3, j, 0)] = 0.7;
-      this->lsq_pseudoinv[pseudoinv_at(i, 4, j, 0)] = 1.3;
+      lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.5);
+      lsq_pseudoinv_h[pseudoinv_at(i, 2, j, 0)] = static_cast<TypeParam>(0.2);
+      lsq_pseudoinv_h[pseudoinv_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7);
+      lsq_pseudoinv_h[pseudoinv_at(i, 4, j, 0)] = static_cast<TypeParam>(1.3);
     }
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
-    this->lsq_moments[moments_at(i, 0, 2)] = 0.4;
-    this->lsq_moments[moments_at(i, 0, 3)] = 0.5;
-    this->lsq_moments[moments_at(i, 0, 4)] = 0.6;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
+    lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4);
+    lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5);
+    lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_q_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -676,25 +895,28 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVD) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      -0.56, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(-0.56), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      1.0, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(1.0), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      0.5, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(0.5), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
-      0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
+      static_cast<TypeParam>(0.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
-      0.7, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
+      static_cast<TypeParam>(0.7), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
-      1.3, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
+      static_cast<TypeParam>(1.3), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
@@ -713,32 +935,51 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
+  // Seed the generator from std::random_device (seed is not fixed)
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
   std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0);
 
-  // Initialization is done only for iblk = 0 and ilev = 0
+  // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
       for (int k = 0; k < lsq_dim_c; ++k) {
-        this->lsq_pseudoinv[pseudoinv_at(i, j, k, 0)] = real_distrib(gen);
+        lsq_pseudoinv_h[pseudoinv_at(i, j, k, 0)] = real_distrib(gen);
       }
-      this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen);
+      lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen);
     }
 
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_q_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -749,9 +990,11 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
-  std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -760,24 +1003,27 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
         for (int j = 1; j < lsq_dim_unk + 1; ++j) {
           p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = 0.0;
           for (int i = 0; i < lsq_dim_c; ++i) {
             p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] +=
-                this->lsq_pseudoinv[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i];
+                lsq_pseudoinv_h[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i];
           }
+          // Slot j of p_result is built from pseudo-inverse entry j - 1;
+          // slot 0 (the constant term) is filled in after this loop.
         }
         p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-            this->p_cc[p_cc_at(jc, jk, jb)];
+            p_cc_h[p_cc_at(jc, jk, jb)];
         for (int j = 0; j < lsq_dim_unk; ++j) {
           p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -=
+              // coefficient j + 1 pairs with moment j (lsq_dim_unk moments)
               p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] *
-              this->lsq_moments[moments_at(jc, jb, j)];
+              lsq_moments_h[moments_at(jc, jb, j)];
         }
       }
     }
@@ -786,7 +1032,7 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
   // Check result
   for (int j = 0; j < lsq_dim_unk + 1; ++j) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(j, jc, 0, 0))],
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
                   p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], 1e-5)
           << "For loop result fails for j = " << j << ", jc = " << jc;
     }
@@ -811,51 +1057,73 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCell) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0;
     for (int j = 1; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->lsq_qtmat_c[qtmat_at(i, 0, j, 0)] = 1.0;
-      this->lsq_qtmat_c[qtmat_at(i, 1, j, 0)] = 0.9;
-      this->lsq_qtmat_c[qtmat_at(i, 2, j, 0)] = 0.8;
-      this->lsq_qtmat_c[qtmat_at(i, 3, j, 0)] = 0.7;
-      this->lsq_qtmat_c[qtmat_at(i, 4, j, 0)] = 0.6;
-      this->lsq_qtmat_c[qtmat_at(i, 5, j, 0)] = 0.5;
-      this->lsq_qtmat_c[qtmat_at(i, 6, j, 0)] = 0.4;
-      this->lsq_qtmat_c[qtmat_at(i, 7, j, 0)] = 0.3;
-      this->lsq_qtmat_c[qtmat_at(i, 8, j, 0)] = 0.2;
+      lsq_qtmat_c_h[qtmat_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_qtmat_c_h[qtmat_at(i, 1, j, 0)] = static_cast<TypeParam>(0.9);
+      lsq_qtmat_c_h[qtmat_at(i, 2, j, 0)] = static_cast<TypeParam>(0.8);
+      lsq_qtmat_c_h[qtmat_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7);
+      lsq_qtmat_c_h[qtmat_at(i, 4, j, 0)] = static_cast<TypeParam>(0.6);
+      lsq_qtmat_c_h[qtmat_at(i, 5, j, 0)] = static_cast<TypeParam>(0.5);
+      lsq_qtmat_c_h[qtmat_at(i, 6, j, 0)] = static_cast<TypeParam>(0.4);
+      lsq_qtmat_c_h[qtmat_at(i, 7, j, 0)] = static_cast<TypeParam>(0.3);
+      lsq_qtmat_c_h[qtmat_at(i, 8, j, 0)] = static_cast<TypeParam>(0.2);
     }
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
-      this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = 2.0;
+      lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = static_cast<TypeParam>(2.0);
     }
 
     for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) {
-      this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = 1.0;
+      lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = static_cast<TypeParam>(1.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
-    this->lsq_moments[moments_at(i, 0, 2)] = 0.4;
-    this->lsq_moments[moments_at(i, 0, 3)] = 0.5;
-    this->lsq_moments[moments_at(i, 0, 4)] = 0.6;
-    this->lsq_moments[moments_at(i, 0, 5)] = 0.7;
-    this->lsq_moments[moments_at(i, 0, 6)] = 0.8;
-    this->lsq_moments[moments_at(i, 0, 7)] = 0.9;
-    this->lsq_moments[moments_at(i, 0, 8)] = 1.0;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
+    lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4);
+    lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5);
+    lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6);
+    lsq_moments_h[moments_at(i, 0, 5)] = static_cast<TypeParam>(0.7);
+    lsq_moments_h[moments_at(i, 0, 6)] = static_cast<TypeParam>(0.8);
+    lsq_moments_h[moments_at(i, 0, 7)] = static_cast<TypeParam>(0.9);
+    lsq_moments_h[moments_at(i, 0, 8)] = static_cast<TypeParam>(1.0);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_c<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -867,37 +1135,40 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCell) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      0.28, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(0.28), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      -0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(-0.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
-      -0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
+      static_cast<TypeParam>(-0.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))],
-      -0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))],
+      static_cast<TypeParam>(-0.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))],
-      -0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))],
+      static_cast<TypeParam>(-0.2), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
@@ -916,6 +1187,17 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
       at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_qtmat_c_h = Kokkos::create_mirror_view(this->lsq_qtmat_c);
+  auto lsq_rmat_rdiag_c_h = Kokkos::create_mirror_view(this->lsq_rmat_rdiag_c);
+  auto lsq_rmat_utri_c_h = Kokkos::create_mirror_view(this->lsq_rmat_utri_c);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -923,29 +1205,39 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
       for (int k = 0; k < lsq_dim_c; ++k) {
-        this->lsq_qtmat_c[qtmat_at(i, j, k, 0)] = real_distrib(gen);
+        lsq_qtmat_c_h[qtmat_at(i, j, k, 0)] = real_distrib(gen);
       }
-      this->lsq_rmat_rdiag_c[rmat_rdiag_at(i, j, 0)] = real_distrib(gen);
-      this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen);
+      lsq_rmat_rdiag_c_h[rmat_rdiag_at(i, j, 0)] = real_distrib(gen);
+      lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen);
     }
     for (int j = 0; j < (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2; ++j) {
-      this->lsq_rmat_utri_c[rmat_utri_at(i, j, 0)] = real_distrib(gen);
+      lsq_rmat_utri_c_h[rmat_utri_at(i, j, 0)] = real_distrib(gen);
     }
 
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_qtmat_c, lsq_qtmat_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_rdiag_c, lsq_rmat_rdiag_c_h);
+  Kokkos::deep_copy(this->lsq_rmat_utri_c, lsq_rmat_utri_c_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_c<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -957,6 +1249,9 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
@@ -968,16 +1263,15 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
         for (int j = 0; j < lsq_dim_unk; ++j) {
           z_qt_times_d[j] = 0.0;
           for (int i = 0; i < lsq_dim_c; ++i) {
-            z_qt_times_d[j] +=
-                this->lsq_qtmat_c[qtmat_at(jc, j, i, jb)] * z_d[i];
+            z_qt_times_d[j] += lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d[i];
           }
         }
         int utri_id = 0;
@@ -985,29 +1279,29 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
           p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = z_qt_times_d[j - 1];
           for (int k = j + 1; k <= lsq_dim_unk; ++k) {
             p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] -=
-                this->lsq_rmat_utri_c[rmat_utri_at(jc, utri_id++, jb)] *
+                lsq_rmat_utri_c_h[rmat_utri_at(jc, utri_id++, jb)] *
                 p_result[at<lsq_dim_unk + 1, nproma>(k, jc)];
           }
           p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] *=
-              this->lsq_rmat_rdiag_c[rmat_rdiag_at(jc, j - 1, jb)];
+              lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, j - 1, jb)];
         }
         p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-            this->p_cc[p_cc_at(jc, jk, jb)];
+            p_cc_h[p_cc_at(jc, jk, jb)];
         for (int j = 0; j < lsq_dim_unk; ++j) {
           p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -=
               p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] *
-              this->lsq_moments[moments_at(jc, jb, j)];
+              lsq_moments_h[moments_at(jc, jb, j)];
         }
       }
     }
   }
 
   // Check result
-  for (int i = 0; i < lsq_dim_unk + 1; ++i) {
+  for (int j = 0; j < lsq_dim_unk + 1; ++j) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
-          << "For loop result fails for i = " << i << ", jc = " << jc;
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
+                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], 1e-5)
+          << "For loop result fails for j = " << j << ", jc = " << jc;
     }
   }
 }
@@ -1025,43 +1319,61 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVD) {
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = (i + 1);
+    p_cc_h[p_cc_at(i, 0, 0)] = static_cast<TypeParam>(i + 1);
 
-    this->cell_neighbor_idx[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
-    this->cell_neighbor_blk[cell_neighbor_at(i, 0, 0)] = 0;
+    cell_neighbor_idx_h[cell_neighbor_at(i, 0, 0)] = (i + 1) % nproma;
+    cell_neighbor_blk_h[cell_neighbor_at(i, 0, 0)] = 0;
     for (int j = 1; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = i;
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = i;
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->lsq_pseudoinv[pseudoinv_at(i, 0, j, 0)] = 1.0;
-      this->lsq_pseudoinv[pseudoinv_at(i, 1, j, 0)] = 0.9;
-      this->lsq_pseudoinv[pseudoinv_at(i, 2, j, 0)] = 0.8;
-      this->lsq_pseudoinv[pseudoinv_at(i, 3, j, 0)] = 0.7;
-      this->lsq_pseudoinv[pseudoinv_at(i, 4, j, 0)] = 0.6;
-      this->lsq_pseudoinv[pseudoinv_at(i, 5, j, 0)] = 0.5;
-      this->lsq_pseudoinv[pseudoinv_at(i, 6, j, 0)] = 0.4;
-      this->lsq_pseudoinv[pseudoinv_at(i, 7, j, 0)] = 0.3;
-      this->lsq_pseudoinv[pseudoinv_at(i, 8, j, 0)] = 0.2;
+      lsq_pseudoinv_h[pseudoinv_at(i, 0, j, 0)] = static_cast<TypeParam>(1.0);
+      lsq_pseudoinv_h[pseudoinv_at(i, 1, j, 0)] = static_cast<TypeParam>(0.9);
+      lsq_pseudoinv_h[pseudoinv_at(i, 2, j, 0)] = static_cast<TypeParam>(0.8);
+      lsq_pseudoinv_h[pseudoinv_at(i, 3, j, 0)] = static_cast<TypeParam>(0.7);
+      lsq_pseudoinv_h[pseudoinv_at(i, 4, j, 0)] = static_cast<TypeParam>(0.6);
+      lsq_pseudoinv_h[pseudoinv_at(i, 5, j, 0)] = static_cast<TypeParam>(0.5);
+      lsq_pseudoinv_h[pseudoinv_at(i, 6, j, 0)] = static_cast<TypeParam>(0.4);
+      lsq_pseudoinv_h[pseudoinv_at(i, 7, j, 0)] = static_cast<TypeParam>(0.3);
+      lsq_pseudoinv_h[pseudoinv_at(i, 8, j, 0)] = static_cast<TypeParam>(0.2);
     }
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = 0.0;
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
 
-    this->lsq_moments[moments_at(i, 0, 0)] = 0.2;
-    this->lsq_moments[moments_at(i, 0, 1)] = 0.3;
-    this->lsq_moments[moments_at(i, 0, 2)] = 0.4;
-    this->lsq_moments[moments_at(i, 0, 3)] = 0.5;
-    this->lsq_moments[moments_at(i, 0, 4)] = 0.6;
-    this->lsq_moments[moments_at(i, 0, 5)] = 0.7;
-    this->lsq_moments[moments_at(i, 0, 6)] = 0.8;
-    this->lsq_moments[moments_at(i, 0, 7)] = 0.9;
-    this->lsq_moments[moments_at(i, 0, 8)] = 1.0;
+    lsq_moments_h[moments_at(i, 0, 0)] = static_cast<TypeParam>(0.2);
+    lsq_moments_h[moments_at(i, 0, 1)] = static_cast<TypeParam>(0.3);
+    lsq_moments_h[moments_at(i, 0, 2)] = static_cast<TypeParam>(0.4);
+    lsq_moments_h[moments_at(i, 0, 3)] = static_cast<TypeParam>(0.5);
+    lsq_moments_h[moments_at(i, 0, 4)] = static_cast<TypeParam>(0.6);
+    lsq_moments_h[moments_at(i, 0, 5)] = static_cast<TypeParam>(0.7);
+    lsq_moments_h[moments_at(i, 0, 6)] = static_cast<TypeParam>(0.8);
+    lsq_moments_h[moments_at(i, 0, 7)] = static_cast<TypeParam>(0.9);
+    lsq_moments_h[moments_at(i, 0, 8)] = static_cast<TypeParam>(1.0);
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_c_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -1072,37 +1384,40 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVD) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Check result
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
-      -1.64, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(0, 0, 0, 0))],
+      static_cast<TypeParam>(-1.64), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
-      1.0, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(1, 0, 0, 0))],
+      static_cast<TypeParam>(1.0), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
-      0.9, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(2, 0, 0, 0))],
+      static_cast<TypeParam>(0.9), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
-      0.8, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(3, 0, 0, 0))],
+      static_cast<TypeParam>(0.8), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
-      0.7, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(4, 0, 0, 0))],
+      static_cast<TypeParam>(0.7), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
-      0.6, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(5, 0, 0, 0))],
+      static_cast<TypeParam>(0.6), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))],
-      0.5, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(6, 0, 0, 0))],
+      static_cast<TypeParam>(0.5), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))],
-      0.4, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(7, 0, 0, 0))],
+      static_cast<TypeParam>(0.4), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))],
-      0.3, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(8, 0, 0, 0))],
+      static_cast<TypeParam>(0.3), 1e-6);
   EXPECT_NEAR(
-      this->p_coeff[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))],
-      0.2, 1e-6);
+      p_coeff_h[(at<lsq_dim_unk + 1, nproma, nlev, nblks_c>(9, 0, 0, 0))],
+      static_cast<TypeParam>(0.2), 1e-6);
 }
 
 TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
@@ -1118,6 +1433,16 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
+  // Create host mirror views
+  auto p_cc_h = Kokkos::create_mirror_view(this->p_cc);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto lsq_pseudoinv_h = Kokkos::create_mirror_view(this->lsq_pseudoinv);
+  auto lsq_moments_h = Kokkos::create_mirror_view(this->lsq_moments);
+  auto p_coeff_h = Kokkos::create_mirror_view(this->p_coeff);
+
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
@@ -1125,25 +1450,33 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
-    this->p_cc[p_cc_at(i, 0, 0)] = real_distrib(gen);
+    p_cc_h[p_cc_at(i, 0, 0)] = real_distrib(gen);
 
     for (int j = 0; j < lsq_dim_c; ++j) {
-      this->cell_neighbor_idx[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      this->cell_neighbor_blk[cell_neighbor_at(i, 0, j)] = 0;
+      cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0;
     }
 
     for (int j = 0; j < lsq_dim_unk; ++j) {
       for (int k = 0; k < lsq_dim_c; ++k) {
-        this->lsq_pseudoinv[pseudoinv_at(i, j, k, 0)] = real_distrib(gen);
+        lsq_pseudoinv_h[pseudoinv_at(i, j, k, 0)] = real_distrib(gen);
       }
-      this->lsq_moments[moments_at(i, 0, j)] = real_distrib(gen);
+      lsq_moments_h[moments_at(i, 0, j)] = real_distrib(gen);
     }
 
     for (int j = 0; j < lsq_dim_unk + 1; ++j) {
-      this->p_coeff[p_coeff_at(j, i, 0, 0)] = real_distrib(gen);
+      p_coeff_h[p_coeff_at(j, i, 0, 0)] = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_cc, p_cc_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->lsq_pseudoinv, lsq_pseudoinv_h);
+  Kokkos::deep_copy(this->lsq_moments, lsq_moments_h);
+  Kokkos::deep_copy(this->p_coeff, p_coeff_h);
+
   // Test function
   recon_lsq_cell_c_svd<TypeParam>(
       this->p_cc.data(), this->cell_neighbor_idx.data(),
@@ -1154,6 +1487,9 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
       this->lacc, this->nblks_c, this->nlev, this->lsq_dim_unk,
       this->lsq_dim_c);
 
+  // Copy results back to host
+  Kokkos::deep_copy(p_coeff_h, this->p_coeff);
+
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
@@ -1166,32 +1502,33 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d[i] = this->p_cc[p_cc_at(
-                       this->cell_neighbor_idx[cell_neighbor_at(jc, jb, i)], jk,
-                       this->cell_neighbor_blk[cell_neighbor_at(jc, jb, i)])] -
-                   this->p_cc[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
         for (int j = 1; j < lsq_dim_unk + 1; ++j) {
           p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] = 0.0;
           for (int i = 0; i < lsq_dim_c; ++i) {
             p_result[at<lsq_dim_unk + 1, nproma>(j, jc)] +=
-                this->lsq_pseudoinv[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i];
+                lsq_pseudoinv_h[pseudoinv_at(jc, j - 1, i, jb)] * z_d[i];
           }
         }
         p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
-            this->p_cc[p_cc_at(jc, jk, jb)];
+            p_cc_h[p_cc_at(jc, jk, jb)];
         for (int j = 0; j < lsq_dim_unk; ++j) {
           p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -=
               p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] *
-              this->lsq_moments[moments_at(jc, jb, j)];
+              lsq_moments_h[moments_at(jc, jb, j)];
         }
       }
     }
   }
+
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(this->p_coeff[(p_coeff_at(i, jc, 0, 0))],
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
                   p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
-- 
GitLab


From 97577dd3de83fa953eae5cdc4704142241d1f370 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Thu, 27 Mar 2025 11:58:57 +0100
Subject: [PATCH 07/34] port horizontal_rot tests to Kokkos views and re-enable them

---
 test/c/CMakeLists.txt          |   2 +-
 test/c/test_horizontal_rot.cpp | 301 ++++++++++++++++++++-------------
 2 files changed, 186 insertions(+), 117 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index f4c5e27..2dd32f4 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -33,8 +33,8 @@ endif()
 set(SOURCES
   main.cpp
   test_horizontal_div.cpp
-  # test_horizontal_rot.cpp
   test_horizontal_recon.cpp
+  test_horizontal_rot.cpp
   # test_tdma_solver.cpp
   # test_interpolation_vector.cpp
   # test_intp_rbf.cpp
diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp
index 68e8024..2c8fc46 100644
--- a/test/c/test_horizontal_rot.cpp
+++ b/test/c/test_horizontal_rot.cpp
@@ -14,8 +14,8 @@
 #include <vector>
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
 #include <dim_helper.hpp>
+#include <gtest/gtest.h>
 #include <horizontal/mo_lib_divrot.hpp>
 #include <support/mo_lib_loopindices.hpp>
 
@@ -38,25 +38,31 @@ protected:
   bool lacc = false;      // Not using ACC-specific behavior.
   bool acc_async = false; // Not using ACC-specific behavior.
 
-  std::vector<ValueType> vec_e;
-  std::vector<int> vert_edge_idx;
-  std::vector<int> vert_edge_blk;
-  std::vector<ValueType> geofac_rot;
-  std::vector<ValueType> rot_vec;
-  std::vector<ValueType> f4din;
-  std::vector<ValueType> f4dout;
-
-  HorizontalRotVertexTest() {
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
+  // Views for the test data. Every array is stored flattened to one dimension.
+  Kokkos::View<ValueType *, memory_space> vec_e;
+  Kokkos::View<int *, memory_space> vert_edge_idx;
+  Kokkos::View<int *, memory_space> vert_edge_blk;
+  Kokkos::View<ValueType *, memory_space> geofac_rot;
+  Kokkos::View<ValueType *, memory_space> rot_vec;
+  Kokkos::View<ValueType *, memory_space> f4din;
+  Kokkos::View<ValueType *, memory_space> f4dout;
+
+  HorizontalRotVertexTest()
+      : vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
+        vert_edge_idx("vert_edge_idx", dim_combine(nproma, nblks_v, 6)),
+        vert_edge_blk("vert_edge_blk", dim_combine(nproma, nblks_v, 6)),
+        geofac_rot("geofac_rot", dim_combine(nproma, 6, nblks_v)),
+        rot_vec("rot_vec", dim_combine(nproma, nlev, nblks_v)),
+        f4din("f4din", dim_combine(nproma, nlev, nblks_e, dim4d)),
+        f4dout("f4dout", dim_combine(nproma, nlev, nblks_v, dim4d)) {
+
+    // We keep slev and elev as std::vector since they are small and used only
+    // on the host.
     slev.resize(dim4d, 0);
     elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1)
-
-    vec_e.resize(dim_combine(nproma, nlev, nblks_e));
-    vert_edge_idx.resize(dim_combine(nproma, nblks_v, 6));
-    vert_edge_blk.resize(dim_combine(nproma, nblks_v, 6));
-    geofac_rot.resize(dim_combine(nproma, 6, nblks_v));
-    rot_vec.resize(dim_combine(nproma, nlev, nblks_v));
-    f4din.resize(dim_combine(nproma, nlev, nblks_e, dim4d));
-    f4dout.resize(dim_combine(nproma, nlev, nblks_v, dim4d));
   }
 };
 
@@ -76,33 +82,46 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosSpecific) {
   const auto &geofac_rot_at = at<nproma, 6, nblks_v>;
   const auto &rot_vec_at = at<nproma, nlev, nblks_v>;
 
+  // Create host mirror views
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx);
+  auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk);
+  auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot);
+  auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec);
+
   // Initialization with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
+      vec_e_h(vec_e_at(i, k, 0)) = (i + 1) * (k + 1); // Simple pattern
     }
 
     // Set edge indices to point to specific edges
     for (int j = 0; j < 6; ++j) {
-      this->vert_edge_idx[vert_edge_at(i, 0, j)] = (i + j) % nproma;
+      vert_edge_idx_h(vert_edge_at(i, 0, j)) = (i + j) % nproma;
       // All edges are in the same block for this test
-      this->vert_edge_blk[vert_edge_at(i, 0, j)] = 0;
+      vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0;
     }
 
-    // Geometric factors for rotation
-    this->geofac_rot[geofac_rot_at(i, 0, 0)] = 0.3;
-    this->geofac_rot[geofac_rot_at(i, 1, 0)] = 0.2;
-    this->geofac_rot[geofac_rot_at(i, 2, 0)] = 0.1;
-    this->geofac_rot[geofac_rot_at(i, 3, 0)] = 0.2;
-    this->geofac_rot[geofac_rot_at(i, 4, 0)] = 0.1;
-    this->geofac_rot[geofac_rot_at(i, 5, 0)] = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 0, 0)) = 0.3;
+    geofac_rot_h(geofac_rot_at(i, 1, 0)) = 0.2;
+    geofac_rot_h(geofac_rot_at(i, 2, 0)) = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 3, 0)) = 0.2;
+    geofac_rot_h(geofac_rot_at(i, 4, 0)) = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 5, 0)) = 0.1;
 
     // Initialize rot_vec to zero
     for (int k = 0; k < nlev; ++k) {
-      this->rot_vec[rot_vec_at(i, k, 0)] = 0.0;
+      rot_vec_h(rot_vec_at(i, k, 0)) = 0.0;
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h);
+  Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_rot, geofac_rot_h);
+  Kokkos::deep_copy(this->rot_vec, rot_vec_h);
+
   // Call the rot_vertex_atmos function
   rot_vertex_atmos<TypeParam>(
       this->vec_e.data(), this->vert_edge_idx.data(),
@@ -111,13 +130,22 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosSpecific) {
       this->slev[0], this->elev[0], this->nproma, this->lacc, this->nlev,
       this->nblks_e, this->nblks_v);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(rot_vec_h, this->rot_vec);
+
   // Expected values based on the initialization pattern
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 1, 0)], 3.4, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 0, 0)], 2.1, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 1, 0)], 4.2, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 0, 0)], 2.2, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 1, 0)], 4.4, 1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 0, 0)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 1, 0)], static_cast<TypeParam>(3.4),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 0, 0)], static_cast<TypeParam>(2.1),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 1, 0)], static_cast<TypeParam>(4.2),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 0, 0)], static_cast<TypeParam>(2.2),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 1, 0)], static_cast<TypeParam>(4.4),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
@@ -131,6 +159,13 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
   const auto &geofac_rot_at = at<nproma, 6, nblks_v>;
   const auto &rot_vec_at = at<nproma, nlev, nblks_v>;
 
+  // Create host mirror views
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx);
+  auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk);
+  auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot);
+  auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec);
+
   // Set up random number generators
   std::random_device rd;
   std::mt19937 gen(rd());
@@ -140,27 +175,34 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
+      vec_e_h(vec_e_at(i, k, 0)) = real_distrib(gen);
     }
 
     // Set random edge indices
     for (int j = 0; j < 6; ++j) {
-      this->vert_edge_idx[vert_edge_at(i, 0, j)] = int_distrib(gen);
-      this->vert_edge_blk[vert_edge_at(i, 0, j)] =
+      vert_edge_idx_h(vert_edge_at(i, 0, j)) = int_distrib(gen);
+      vert_edge_blk_h(vert_edge_at(i, 0, j)) =
           0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
     for (int j = 0; j < 6; ++j) {
-      this->geofac_rot[geofac_rot_at(i, j, 0)] = real_distrib(gen);
+      geofac_rot_h(geofac_rot_at(i, j, 0)) = real_distrib(gen);
     }
 
-    // Initialize rot_vec to random values
+    // Initialize rot_vec to zero
     for (int k = 0; k < nlev; ++k) {
-      this->rot_vec[rot_vec_at(i, k, 0)] = real_distrib(gen);
+      rot_vec_h(rot_vec_at(i, k, 0)) = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h);
+  Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_rot, geofac_rot_h);
+  Kokkos::deep_copy(this->rot_vec, rot_vec_h);
+
   // Call the rot_vertex_atmos function
   rot_vertex_atmos<TypeParam>(
       this->vec_e.data(), this->vert_edge_idx.data(),
@@ -169,6 +211,9 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
       this->slev[0], this->elev[0], this->nproma, this->lacc, this->nlev,
       this->nblks_e, this->nblks_v);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(rot_vec_h, this->rot_vec);
+
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0);
 
@@ -180,30 +225,24 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jv = i_startidx; jv < i_endidx; ++jv) {
         ref_rot_vec[rot_vec_at(jv, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 0)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 0)])] *
-                this->geofac_rot[geofac_rot_at(jv, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 1)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 1)])] *
-                this->geofac_rot[geofac_rot_at(jv, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 2)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 2)])] *
-                this->geofac_rot[geofac_rot_at(jv, 2, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 3)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 3)])] *
-                this->geofac_rot[geofac_rot_at(jv, 3, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 4)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 4)])] *
-                this->geofac_rot[geofac_rot_at(jv, 4, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 5)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 5)])] *
-                this->geofac_rot[geofac_rot_at(jv, 5, jb)];
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] *
+                geofac_rot_h[geofac_rot_at(jv, 0, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 1)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 1)])] *
+                geofac_rot_h[geofac_rot_at(jv, 1, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 2)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 2)])] *
+                geofac_rot_h[geofac_rot_at(jv, 2, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 3)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 3)])] *
+                geofac_rot_h[geofac_rot_at(jv, 3, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 4)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 4)])] *
+                geofac_rot_h[geofac_rot_at(jv, 4, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 5)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 5)])] *
+                geofac_rot_h[geofac_rot_at(jv, 5, jb)];
       }
     }
   }
@@ -211,7 +250,7 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->rot_vec[rot_vec_at(i, k, 0)],
+      EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)],
                   ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5)
           << "Results differ at i=" << i << ", k=" << k;
     }
@@ -231,33 +270,47 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRISpecific) {
   const auto &geofac_rot_at = at<nproma, 6, nblks_v>;
   const auto &rot_vec_at = at<nproma, nlev, nblks_v>;
 
+  // Create host mirror views
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx);
+  auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk);
+  auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot);
+  auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec);
+
   // Initialization with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = (i + 1) * (k + 1); // Simple pattern
+      vec_e_h(vec_e_at(i, k, 0)) = (i + 1) * (k + 1); // Simple pattern
     }
 
     // Set edge indices to point to specific edges
     for (int j = 0; j < 6; ++j) {
-      this->vert_edge_idx[vert_edge_at(i, 0, j)] = (i + j) % nproma;
+      vert_edge_idx_h(vert_edge_at(i, 0, j)) = (i + j) % nproma;
       // All edges are in the same block for this test
-      this->vert_edge_blk[vert_edge_at(i, 0, j)] = 0;
+      vert_edge_blk_h(vert_edge_at(i, 0, j)) = 0;
     }
 
     // Geometric factors for rotation
-    this->geofac_rot[geofac_rot_at(i, 0, 0)] = 0.3;
-    this->geofac_rot[geofac_rot_at(i, 1, 0)] = 0.2;
-    this->geofac_rot[geofac_rot_at(i, 2, 0)] = 0.1;
-    this->geofac_rot[geofac_rot_at(i, 3, 0)] = 0.2;
-    this->geofac_rot[geofac_rot_at(i, 4, 0)] = 0.1;
-    this->geofac_rot[geofac_rot_at(i, 5, 0)] = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 0, 0)) = 0.3;
+    geofac_rot_h(geofac_rot_at(i, 1, 0)) = 0.2;
+    geofac_rot_h(geofac_rot_at(i, 2, 0)) = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 3, 0)) = 0.2;
+    geofac_rot_h(geofac_rot_at(i, 4, 0)) = 0.1;
+    geofac_rot_h(geofac_rot_at(i, 5, 0)) = 0.1;
 
     // Initialize rot_vec to zero
     for (int k = 0; k < nlev; ++k) {
-      this->rot_vec[rot_vec_at(i, k, 0)] = 0.0;
+      rot_vec_h(rot_vec_at(i, k, 0)) = 0.0;
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h);
+  Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_rot, geofac_rot_h);
+  Kokkos::deep_copy(this->rot_vec, rot_vec_h);
+
   // Call the rot_vertex_ri function
   rot_vertex_ri<TypeParam>(
       this->vec_e.data(), this->vert_edge_idx.data(),
@@ -266,13 +319,22 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRISpecific) {
       this->slev[0], this->elev[0], this->nproma, this->lacc, this->acc_async,
       this->nlev, this->nblks_e, this->nblks_v);
 
+  // Copy results back to host for verification
+  Kokkos::deep_copy(rot_vec_h, this->rot_vec);
+
   // Expected values based on the initialization pattern
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 0, 0)], 1.7, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(0, 1, 0)], 3.4, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 0, 0)], 2.1, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(1, 1, 0)], 4.2, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 0, 0)], 2.2, 1e-6);
-  EXPECT_NEAR(this->rot_vec[rot_vec_at(2, 1, 0)], 4.4, 1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 0, 0)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(0, 1, 0)], static_cast<TypeParam>(3.4),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 0, 0)], static_cast<TypeParam>(2.1),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(1, 1, 0)], static_cast<TypeParam>(4.2),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 0, 0)], static_cast<TypeParam>(2.2),
+              1e-6);
+  EXPECT_NEAR(rot_vec_h[rot_vec_at(2, 1, 0)], static_cast<TypeParam>(4.4),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
@@ -286,6 +348,13 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
   const auto &geofac_rot_at = at<nproma, 6, nblks_v>;
   const auto &rot_vec_at = at<nproma, nlev, nblks_v>;
 
+  // Create host mirror views
+  auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
+  auto vert_edge_idx_h = Kokkos::create_mirror_view(this->vert_edge_idx);
+  auto vert_edge_blk_h = Kokkos::create_mirror_view(this->vert_edge_blk);
+  auto geofac_rot_h = Kokkos::create_mirror_view(this->geofac_rot);
+  auto rot_vec_h = Kokkos::create_mirror_view(this->rot_vec);
+
   // Set up random number generators
   std::random_device rd;
   std::mt19937 gen(rd());
@@ -295,27 +364,34 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      this->vec_e[vec_e_at(i, k, 0)] = real_distrib(gen);
+      vec_e_h(vec_e_at(i, k, 0)) = real_distrib(gen);
     }
 
     // Set random edge indices
     for (int j = 0; j < 6; ++j) {
-      this->vert_edge_idx[vert_edge_at(i, 0, j)] = int_distrib(gen);
-      this->vert_edge_blk[vert_edge_at(i, 0, j)] =
+      vert_edge_idx_h(vert_edge_at(i, 0, j)) = int_distrib(gen);
+      vert_edge_blk_h(vert_edge_at(i, 0, j)) =
           0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
     for (int j = 0; j < 6; ++j) {
-      this->geofac_rot[geofac_rot_at(i, j, 0)] = real_distrib(gen);
+      geofac_rot_h(geofac_rot_at(i, j, 0)) = real_distrib(gen);
     }
 
-    // Initialize rot_vec to random values
+    // Initialize rot_vec to zero
     for (int k = 0; k < nlev; ++k) {
-      this->rot_vec[rot_vec_at(i, k, 0)] = real_distrib(gen);
+      rot_vec_h(rot_vec_at(i, k, 0)) = static_cast<TypeParam>(0.0);
     }
   }
 
+  // Copy initialized data to device
+  Kokkos::deep_copy(this->vec_e, vec_e_h);
+  Kokkos::deep_copy(this->vert_edge_idx, vert_edge_idx_h);
+  Kokkos::deep_copy(this->vert_edge_blk, vert_edge_blk_h);
+  Kokkos::deep_copy(this->geofac_rot, geofac_rot_h);
+  Kokkos::deep_copy(this->rot_vec, rot_vec_h);
+
   // Call the rot_vertex_ri function
   rot_vertex_ri<TypeParam>(
       this->vec_e.data(), this->vert_edge_idx.data(),
@@ -324,8 +400,8 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
       this->slev[0], this->elev[0], this->nproma, this->lacc, this->acc_async,
       this->nlev, this->nblks_e, this->nblks_v);
 
-  // Ensure computation is complete for both modes
-  Kokkos::fence();
+  // Copy results back to host for verification
+  Kokkos::deep_copy(rot_vec_h, this->rot_vec);
 
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0);
@@ -338,30 +414,24 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jv = i_startidx; jv < i_endidx; ++jv) {
         ref_rot_vec[rot_vec_at(jv, jk, jb)] =
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 0)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 0)])] *
-                this->geofac_rot[geofac_rot_at(jv, 0, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 1)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 1)])] *
-                this->geofac_rot[geofac_rot_at(jv, 1, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 2)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 2)])] *
-                this->geofac_rot[geofac_rot_at(jv, 2, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 3)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 3)])] *
-                this->geofac_rot[geofac_rot_at(jv, 3, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 4)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 4)])] *
-                this->geofac_rot[geofac_rot_at(jv, 4, jb)] +
-            this->vec_e[vec_e_at(
-                this->vert_edge_idx[vert_edge_at(jv, jb, 5)], jk,
-                this->vert_edge_blk[vert_edge_at(jv, jb, 5)])] *
-                this->geofac_rot[geofac_rot_at(jv, 5, jb)];
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] *
+                geofac_rot_h[geofac_rot_at(jv, 0, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 1)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 1)])] *
+                geofac_rot_h[geofac_rot_at(jv, 1, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 2)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 2)])] *
+                geofac_rot_h[geofac_rot_at(jv, 2, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 3)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 3)])] *
+                geofac_rot_h[geofac_rot_at(jv, 3, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 4)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 4)])] *
+                geofac_rot_h[geofac_rot_at(jv, 4, jb)] +
+            vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 5)], jk,
+                             vert_edge_blk_h[vert_edge_at(jv, jb, 5)])] *
+                geofac_rot_h[geofac_rot_at(jv, 5, jb)];
       }
     }
   }
@@ -369,10 +439,9 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      EXPECT_NEAR(this->rot_vec[rot_vec_at(i, k, 0)],
+      EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)],
                   ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5)
           << "Results differ at i=" << i << ", k=" << k << ")";
     }
   }
 }
-
-- 
GitLab


From ec65397a9e1b0590a639ef693d0b61804bb81dca Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Thu, 27 Mar 2025 11:59:57 +0100
Subject: [PATCH 08/34] formatted the file test_horizontal_div

---
 test/c/test_horizontal_div.cpp | 440 +++++++++++++++++++--------------
 1 file changed, 251 insertions(+), 189 deletions(-)

diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index bc6ea9f..5904691 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -14,14 +14,13 @@
 #include <vector>
 
 #include <Kokkos_Core.hpp>
-#include <gtest/gtest.h>
 #include <dim_helper.hpp>
+#include <gtest/gtest.h>
 #include <horizontal/mo_lib_divrot.hpp>
 #include <support/mo_lib_loopindices.hpp>
 
 /// Test class for the horizontal divergence tests. Templated for the ValueType
-template <typename ValueType>
-class HorizontalDivTest : public ::testing::Test {
+template <typename ValueType> class HorizontalDivTest : public ::testing::Test {
 protected:
   static constexpr int nproma = 3;  // inner loop length
   static constexpr int nlev = 2;    // number of vertical levels
@@ -37,26 +36,26 @@ protected:
   std::vector<int> elev;
   bool lacc = false; // Not using ACC-specific behavior.
 
-  // Here we allocate Kokkos::View objects in a memory space that is directly accessible
-  // from both the host and device
+  // Here we allocate Kokkos::View objects in a memory space that is directly
+  // accessible from both the host and device
   using exec_space = Kokkos::DefaultExecutionSpace;
   using memory_space = exec_space::memory_space;
 
   // Views for the test data. All the data is assigned as one-dimensional arrays
-  Kokkos::View<ValueType*, memory_space> vec_e;
-  Kokkos::View<int*, memory_space> cell_edge_idx;
-  Kokkos::View<int*, memory_space> cell_edge_blk;
-  Kokkos::View<ValueType*, memory_space> geofac_div;
-  Kokkos::View<ValueType*, memory_space> div_vec_c;
-  Kokkos::View<ValueType*, memory_space> f4din;
-  Kokkos::View<ValueType*, memory_space> f4dout;
+  Kokkos::View<ValueType *, memory_space> vec_e;
+  Kokkos::View<int *, memory_space> cell_edge_idx;
+  Kokkos::View<int *, memory_space> cell_edge_blk;
+  Kokkos::View<ValueType *, memory_space> geofac_div;
+  Kokkos::View<ValueType *, memory_space> div_vec_c;
+  Kokkos::View<ValueType *, memory_space> f4din;
+  Kokkos::View<ValueType *, memory_space> f4dout;
 
   // Followings are needed in HorizontalDivAvgTest
-  Kokkos::View<int*, memory_space> cell_neighbor_idx;
-  Kokkos::View<int*, memory_space> cell_neighbor_blk;
-  Kokkos::View<ValueType*, memory_space> avg_coeff;
-  Kokkos::View<ValueType*, memory_space> opt_in2;
-  Kokkos::View<ValueType*, memory_space> opt_out2;
+  Kokkos::View<int *, memory_space> cell_neighbor_idx;
+  Kokkos::View<int *, memory_space> cell_neighbor_blk;
+  Kokkos::View<ValueType *, memory_space> avg_coeff;
+  Kokkos::View<ValueType *, memory_space> opt_in2;
+  Kokkos::View<ValueType *, memory_space> opt_out2;
 
   HorizontalDivTest()
       : vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
@@ -70,13 +69,12 @@ protected:
         cell_neighbor_blk("cell_neighbor_blk", dim_combine(nproma, nblks_c, 3)),
         avg_coeff("avg_coeff", dim_combine(nproma, 4, nblks_c)),
         opt_in2("opt_in2", dim_combine(nproma, nlev, nblks_e)),
-        opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c))
-  {
+        opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c)) {
 
-    // We keep slev and elev as std::vector since they are small and used only on the host.
+    // We keep slev and elev as std::vector since they are small and used only
+    // on the host.
     slev.resize(dim4d, 0);
     elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1)
-
   }
 };
 
@@ -126,8 +124,8 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) {
       div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
     }
   }
-  // Copy the initialized data back to the device memory (or unified memory, which in some
-  // cases may be a no-op if already accessible on the host).
+  // Copy the initialized data back to the device memory (or unified memory,
+  // which in some cases may be a no-op if already accessible on the host).
   Kokkos::deep_copy(this->vec_e, vec_e_h);
   Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
   Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
@@ -144,12 +142,18 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DSpecific) {
 
   Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
 
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 0, 0)), static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 1, 0)), static_cast<TypeParam>(3.4), 1e-6);
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 0, 0)), static_cast<TypeParam>(2.1), 1e-6);
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 1, 0)), static_cast<TypeParam>(4.2), 1e-6);
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 0, 0)), static_cast<TypeParam>(2.2), 1e-6);
-  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 1, 0)), static_cast<TypeParam>(4.4), 1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 0, 0)), static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(0, 1, 0)), static_cast<TypeParam>(3.4),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 0, 0)), static_cast<TypeParam>(2.1),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(1, 1, 0)), static_cast<TypeParam>(4.2),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 0, 0)), static_cast<TypeParam>(2.2),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h(div_vec_c_at(2, 1, 0)), static_cast<TypeParam>(4.4),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
@@ -163,7 +167,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   const auto &geofac_div_at = at<nproma, 3, nblks_c>;
   const auto &div_vec_c_at = at<nproma, nlev, nblks_c>;
 
-  //create mirror views to store data on the host
+  // create mirror views to store data on the host
   auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
   auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
   auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
@@ -201,12 +205,12 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   Kokkos::deep_copy(this->geofac_div, geofac_div_h);
   Kokkos::deep_copy(this->div_vec_c, div_vec_c_h);
 
-  div3d<TypeParam>(
-      this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-      this->geofac_div.data(), this->div_vec_c.data(), this->i_startblk,
-      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev[0],
-      this->elev[0], this->nproma, this->lacc, this->nlev, this->nblks_c,
-      this->nblks_e);
+  div3d<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
+                   this->cell_edge_blk.data(), this->geofac_div.data(),
+                   this->div_vec_c.data(), this->i_startblk, this->i_endblk,
+                   this->i_startidx_in, this->i_endidx_in, this->slev[0],
+                   this->elev[0], this->nproma, this->lacc, this->nlev,
+                   this->nblks_c, this->nblks_e);
 
   Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
 
@@ -221,17 +225,14 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
@@ -273,8 +274,10 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) {
   // Initialization with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
-      f4din_h[f4d_at(i, k, 0, 0)] = static_cast<TypeParam>((i + 1) * (k + 2)); // Different pattern for second field
+      vec_e_h[vec_e_at(i, k, 0)] =
+          static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      f4din_h[f4d_at(i, k, 0, 0)] = static_cast<TypeParam>(
+          (i + 1) * (k + 2)); // Different pattern for second field
     }
 
     // Set edge indices to point to specific cells (including self)
@@ -322,20 +325,32 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FSpecific) {
   Kokkos::deep_copy(f4dout_h, this->f4dout);
 
   // Check first field (same as in div3d test)
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.4), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.1), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.2), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.2), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.4), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.4),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.1),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.2),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.2),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.4),
+              1e-6);
 
   // Check second field (expected values calculated manually)
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(3.4), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(5.1), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(4.2), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(6.3), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(4.4), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(6.6), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(3.4),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(5.1),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(4.2),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(6.3),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(4.4),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(6.6),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
@@ -377,7 +392,8 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
       cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] =
+          0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
@@ -402,12 +418,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   Kokkos::deep_copy(this->f4dout, f4dout_h);
 
   // Call the div3d_2field function
-  div3d_2field<TypeParam>(
-      this->vec_e.data(), this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-      this->geofac_div.data(), this->div_vec_c.data(), this->f4din.data(),
-      this->f4dout.data(), this->i_startblk, this->i_endblk, this->i_startidx_in,
-      this->i_endidx_in, this->slev[0], this->elev[0], this->nproma, this->lacc,
-      this->nlev, this->nblks_c, this->nblks_e);
+  div3d_2field<TypeParam>(this->vec_e.data(), this->cell_edge_idx.data(),
+                          this->cell_edge_blk.data(), this->geofac_div.data(),
+                          this->div_vec_c.data(), this->f4din.data(),
+                          this->f4dout.data(), this->i_startblk, this->i_endblk,
+                          this->i_startidx_in, this->i_endidx_in, this->slev[0],
+                          this->elev[0], this->nproma, this->lacc, this->nlev,
+                          this->nblks_c, this->nblks_e);
 
   // Copy results back to host for verification
   Kokkos::deep_copy(div_vec_c_h, this->div_vec_c);
@@ -426,32 +443,26 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         // Calculate reference value for first field
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
 
         // Calculate reference value for second field
         ref_f4dout[f4dout_at(jc, jk, jb, 0)] =
-            f4din_h[f4d_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)], 0)] *
+            f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                           cell_edge_blk_h[cell_edge_at(jc, jb, 0)], 0)] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            f4din_h[f4d_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)], 0)] *
+            f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                           cell_edge_blk_h[cell_edge_at(jc, jb, 1)], 0)] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            f4din_h[f4d_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)], 0)] *
+            f4din_h[f4d_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                           cell_edge_blk_h[cell_edge_at(jc, jb, 2)], 0)] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
@@ -500,7 +511,8 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DSpecific) {
     for (int j = 0; j < 3; ++j) {
       cell_edge_idx_h[cell_edge_at(i, 0, j)] = (i + j) % nproma;
       cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0;
-      geofac_div_h[geofac_div_at(i, j, 0)] = static_cast<TypeParam>(0.1 * (j + 1));
+      geofac_div_h[geofac_div_at(i, j, 0)] =
+          static_cast<TypeParam>(0.1 * (j + 1));
     }
 
     for (int k = 0; k < nlev; ++k) {
@@ -519,28 +531,40 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DSpecific) {
   Kokkos::deep_copy(this->f4dout, f4dout_h);
 
   // Test function
-  div4d<TypeParam>(
-    this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-    this->geofac_div.data(), this->f4din.data(), this->f4dout.data(),
-    this->dim4d, this->i_startblk, this->i_endblk, this->i_startidx_in,
-    this->i_endidx_in, this->slev.data(), this->elev.data(), this->nproma,
-    this->lacc, this->nlev, this->nblks_c, this->nblks_e);
+  div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+                   this->geofac_div.data(), this->f4din.data(),
+                   this->f4dout.data(), this->dim4d, this->i_startblk,
+                   this->i_endblk, this->i_startidx_in, this->i_endidx_in,
+                   this->slev.data(), this->elev.data(), this->nproma,
+                   this->lacc, this->nlev, this->nblks_c, this->nblks_e);
 
   // Copy results back to host for verification
   Kokkos::deep_copy(f4dout_h, this->f4dout);
 
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(1.4), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(1.1), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(1.1), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(2.0), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 1)], static_cast<TypeParam>(2.0), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 1)], static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 1)], static_cast<TypeParam>(1.7), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 1)], static_cast<TypeParam>(2.6), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 1)], static_cast<TypeParam>(2.3), 1e-6);
-  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 1)], static_cast<TypeParam>(2.3), 1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 0)], static_cast<TypeParam>(1.4),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 0)], static_cast<TypeParam>(1.1),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 0)], static_cast<TypeParam>(1.1),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 0)], static_cast<TypeParam>(2.0),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 0)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 0)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 0, 0, 1)], static_cast<TypeParam>(2.0),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 0, 0, 1)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 0, 0, 1)], static_cast<TypeParam>(1.7),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(0, 1, 0, 1)], static_cast<TypeParam>(2.6),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(1, 1, 0, 1)], static_cast<TypeParam>(2.3),
+              1e-6);
+  EXPECT_NEAR(f4dout_h[f4dout_at(2, 1, 0, 1)], static_cast<TypeParam>(2.3),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
@@ -591,12 +615,12 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   Kokkos::deep_copy(this->f4dout, f4dout_h);
 
   // Test function
-  div4d<TypeParam>(
-    this->cell_edge_idx.data(), this->cell_edge_blk.data(),
-    this->geofac_div.data(), this->f4din.data(), this->f4dout.data(),
-    this->dim4d, this->i_startblk, this->i_endblk, this->i_startidx_in,
-    this->i_endidx_in, this->slev.data(), this->elev.data(), this->nproma,
-    this->lacc, this->nlev, this->nblks_c, this->nblks_e);
+  div4d<TypeParam>(this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+                   this->geofac_div.data(), this->f4din.data(),
+                   this->f4dout.data(), this->dim4d, this->i_startblk,
+                   this->i_endblk, this->i_startidx_in, this->i_endidx_in,
+                   this->slev.data(), this->elev.data(), this->nproma,
+                   this->lacc, this->nlev, this->nblks_c, this->nblks_e);
 
   // Copy results back to host for verification
   Kokkos::deep_copy(f4dout_h, this->f4dout);
@@ -612,11 +636,10 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
         for (int jc = i_startidx; jc < i_endidx; ++jc) {
           TypeParam expected = 0.0;
           for (int je = 0; je < 3; ++je) {
-            expected +=
-                f4din_h[f4din_at(
-                    cell_edge_idx_h[cell_edge_at(jc, jb, je)], jk,
-                    cell_edge_blk_h[cell_edge_at(jc, jb, je)], ji)] *
-                geofac_div_h[geofac_div_at(jc, je, jb)];
+            expected += f4din_h[f4din_at(
+                            cell_edge_idx_h[cell_edge_at(jc, jb, je)], jk,
+                            cell_edge_blk_h[cell_edge_at(jc, jb, je)], ji)] *
+                        geofac_div_h[geofac_div_at(jc, je, jb)];
           }
 
           EXPECT_NEAR(f4dout_h[f4dout_at(jc, jk, jb, ji)], expected, 1e-5)
@@ -656,8 +679,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
 
   // Create mirror views to store data on the host
   auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
-  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
-  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
   auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
   auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
   auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
@@ -669,8 +694,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   // Initialize the vectors with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
-      opt_in2_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
+      vec_e_h[vec_e_at(i, k, 0)] =
+          static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      opt_in2_h[vec_e_at(i, k, 0)] =
+          static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
     }
 
     // Set edge indices to point to specific cells
@@ -696,9 +723,12 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
 
     // Average coefficients
     avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self
-    avg_coeff_h[avg_coeff_at(i, 1, 0)] = static_cast<TypeParam>(0.2); // First neighbor
-    avg_coeff_h[avg_coeff_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Second neighbor
-    avg_coeff_h[avg_coeff_at(i, 3, 0)] = static_cast<TypeParam>(0.2); // Third neighbor
+    avg_coeff_h[avg_coeff_at(i, 1, 0)] =
+        static_cast<TypeParam>(0.2); // First neighbor
+    avg_coeff_h[avg_coeff_at(i, 2, 0)] =
+        static_cast<TypeParam>(0.2); // Second neighbor
+    avg_coeff_h[avg_coeff_at(i, 3, 0)] =
+        static_cast<TypeParam>(0.2); // Third neighbor
 
     // Initialize div_vec_c and opt_out2 to zero
     for (int k = 0; k < nlev; ++k) {
@@ -735,20 +765,32 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   Kokkos::deep_copy(opt_out2_h, this->opt_out2);
 
   // Verify first field results
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16),
+              1e-6);
 
   // Verify second field results
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.94), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(1.88), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(1.02), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(2.04), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(1.04), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(2.08), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.94),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(1.88),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(1.02),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(2.04),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(1.04),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(2.08),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
@@ -778,8 +820,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
 
   // Create mirror views to store data on the host
   auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
-  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
-  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
   auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
   auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
   auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
@@ -804,10 +848,12 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
       cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] =
+          0; // Keep in same block for simplicity
 
       cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] =
+          0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
@@ -869,31 +915,25 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
 
         aux_c2[div_vec_c_at(jc, jk, jb)] =
-            opt_in2_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+            opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                               cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            opt_in2_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+            opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                               cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            opt_in2_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+            opt_in2_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                               cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
@@ -1000,8 +1040,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
 
   // Create mirror views to store data on the host
   auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
-  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
-  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
   auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
   auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
   auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
@@ -1013,8 +1055,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
   // Initialize the vectors with specific values
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
-      vec_e_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
-      opt_in2_h[vec_e_at(i, k, 0)] = static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
+      vec_e_h[vec_e_at(i, k, 0)] =
+          static_cast<TypeParam>((i + 1) * (k + 1)); // Simple pattern
+      opt_in2_h[vec_e_at(i, k, 0)] =
+          static_cast<TypeParam>((i + 1) * (k + 1) * 0.5); // Half of vec_e
     }
 
     // Set edge indices to point to specific cells
@@ -1040,9 +1084,12 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
 
     // Average coefficients
     avg_coeff_h[avg_coeff_at(i, 0, 0)] = static_cast<TypeParam>(0.4); // Self
-    avg_coeff_h[avg_coeff_at(i, 1, 0)] = static_cast<TypeParam>(0.2); // First neighbor
-    avg_coeff_h[avg_coeff_at(i, 2, 0)] = static_cast<TypeParam>(0.2); // Second neighbor
-    avg_coeff_h[avg_coeff_at(i, 3, 0)] = static_cast<TypeParam>(0.2); // Third neighbor
+    avg_coeff_h[avg_coeff_at(i, 1, 0)] =
+        static_cast<TypeParam>(0.2); // First neighbor
+    avg_coeff_h[avg_coeff_at(i, 2, 0)] =
+        static_cast<TypeParam>(0.2); // Second neighbor
+    avg_coeff_h[avg_coeff_at(i, 3, 0)] =
+        static_cast<TypeParam>(0.2); // Third neighbor
 
     // Initialize div_vec_c and opt_out2 to zero
     for (int k = 0; k < nlev; ++k) {
@@ -1079,20 +1126,32 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
   Kokkos::deep_copy(opt_out2_h, this->opt_out2);
 
   // Verify first field results
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08), 1e-6);
-  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16), 1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(1.88),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(3.76),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(2.04),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(4.08),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(2.08),
+              1e-6);
+  EXPECT_NEAR(div_vec_c_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(4.16),
+              1e-6);
 
   // Since l2fields=false, opt_out2 should not be modified
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(0.0), 1e-6);
-  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(0.0), 1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 0, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(0, 1, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 0, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(1, 1, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 0, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
+  EXPECT_NEAR(opt_out2_h[div_vec_c_at(2, 1, 0)], static_cast<TypeParam>(0.0),
+              1e-6);
 }
 
 TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
@@ -1122,8 +1181,10 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
 
   // Create mirror views to store data on the host
   auto vec_e_h = Kokkos::create_mirror_view(this->vec_e);
-  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
-  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto cell_neighbor_idx_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h =
+      Kokkos::create_mirror_view(this->cell_neighbor_blk);
   auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
   auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
   auto geofac_div_h = Kokkos::create_mirror_view(this->geofac_div);
@@ -1142,16 +1203,19 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       vec_e_h[vec_e_at(i, k, 0)] = real_distrib(gen);
-      opt_in2_h[vec_e_at(i, k, 0)] = real_distrib(gen); // Not used but initialize anyway
+      opt_in2_h[vec_e_at(i, k, 0)] =
+          real_distrib(gen); // Not used but initialize anyway
     }
 
     // Set random edge indices
     for (int j = 0; j < 3; ++j) {
       cell_edge_idx_h[cell_edge_at(i, 0, j)] = int_distrib(gen);
-      cell_edge_blk_h[cell_edge_at(i, 0, j)] = 0; // Keep in same block for simplicity
+      cell_edge_blk_h[cell_edge_at(i, 0, j)] =
+          0; // Keep in same block for simplicity
 
       cell_neighbor_idx_h[cell_neighbor_at(i, 0, j)] = int_distrib(gen);
-      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] = 0; // Keep in same block for simplicity
+      cell_neighbor_blk_h[cell_neighbor_at(i, 0, j)] =
+          0; // Keep in same block for simplicity
     }
 
     // Random geometric factors
@@ -1167,7 +1231,8 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
     // Random initial values for div_vec_c and opt_out2
     for (int k = 0; k < nlev; ++k) {
       div_vec_c_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0);
-      opt_out2_h[div_vec_c_at(i, k, 0)] = static_cast<TypeParam>(0.0); // Not used but initialize anyway
+      opt_out2_h[div_vec_c_at(i, k, 0)] =
+          static_cast<TypeParam>(0.0); // Not used but initialize anyway
     }
   }
 
@@ -1211,17 +1276,14 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
     for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
       for (int jc = i_startidx; jc < i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
                 geofac_div_h[geofac_div_at(jc, 0, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 1)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 1)])] *
                 geofac_div_h[geofac_div_at(jc, 1, jb)] +
-            vec_e_h[vec_e_at(
-                cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
-                cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
+            vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 2)], jk,
+                             cell_edge_blk_h[cell_edge_at(jc, jb, 2)])] *
                 geofac_div_h[geofac_div_at(jc, 2, jb)];
       }
     }
-- 
GitLab


From 23d1ce19c7ea06e0c40513dbfbb161b7acf78f3f Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Thu, 27 Mar 2025 12:13:53 +0100
Subject: [PATCH 09/34] fixed a few warnings

---
 test/c/test_horizontal_recon.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index 089c58f..57b77e6 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -41,6 +41,8 @@ protected:
       return std::make_tuple(9, 5);
     case ReconstructionMethod::cubic:
       return std::make_tuple(9, 9);
+    default:
+      return std::make_tuple(0, 0); // or throw/assert if appropriate
     }
   }
 
@@ -930,9 +932,6 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
   const auto &cell_neighbor_at = at<nproma, nblks_c, lsq_dim_c>;
   const auto &pseudoinv_at = at<nproma, lsq_dim_unk, lsq_dim_c, nblks_c>;
   const auto &p_coeff_at = at<lsq_dim_unk + 1, nproma, nlev, nblks_c>;
-  const auto &rmat_rdiag_at = at<nproma, lsq_dim_unk, nblks_c>;
-  const auto &rmat_utri_at =
-      at<nproma, (lsq_dim_unk * lsq_dim_unk - lsq_dim_unk) / 2, nblks_c>;
   const auto &moments_at = at<nproma, nblks_c, lsq_dim_unk>;
 
   // Create host mirror views
-- 
GitLab


From 58c4b6602edc5fa06c5574520a3128f622dce4d0 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Fri, 4 Apr 2025 15:22:48 +0200
Subject: [PATCH 10/34] removed unused Kokkos::View declarations

---
 src/horizontal/mo_lib_divrot.cpp | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp
index d460211..f21da12 100644
--- a/src/horizontal/mo_lib_divrot.cpp
+++ b/src/horizontal/mo_lib_divrot.cpp
@@ -200,9 +200,6 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  // Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, nlev);
-  // Kokkos::View<T *> z_qt_times_d("z_qt_times_d", lsq_dim_unk);
-
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
 
@@ -366,8 +363,6 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  Kokkos::View<T ***> z_b("z_b", lsq_dim_c, nproma, elev);
-
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
 
@@ -507,9 +502,6 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  // Kokkos::View<T ***> z_d("z_d", lsq_dim_c, nproma, elev);
-  // Kokkos::View<T *> z_qt_times_d("z_qt_times_d", 9);
-
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
 
@@ -753,8 +745,6 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstInt3D;
 
-  // Kokkos::View<T *> z_b("z_b", 9);
-
   UnmanagedConstInt3D iidx(lsq_idx_c, nproma, nblks_c, lsq_dim_c);
   UnmanagedConstInt3D iblk(lsq_blk_c, nproma, nblks_c, lsq_dim_c);
 
-- 
GitLab


From 62fb9fffc6a9f9f5670e45d4509efa28cac853db Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Fri, 4 Apr 2025 15:36:07 +0200
Subject: [PATCH 11/34] corrected the use of nproma in mo_lib_loopindices

---
 src/support/mo_lib_loopindices.cpp | 59 +++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 22 deletions(-)

diff --git a/src/support/mo_lib_loopindices.cpp b/src/support/mo_lib_loopindices.cpp
index 30c82bd..8c8d318 100644
--- a/src/support/mo_lib_loopindices.cpp
+++ b/src/support/mo_lib_loopindices.cpp
@@ -12,21 +12,26 @@
 #include <algorithm> // For std::max
 
 // get_indices_c_lib function
-void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int nproma,
                         const int i_blk, const int i_startblk, const int i_endblk,
                         int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
-    
+
     //Since code is ported incrementally from Fortran to C++, depending on where the function is called from
     //(either fortran or c++), the first index should be either 0 or 1.
     int first_index;
-    if (called_from_cpp)
+    int nproma_loc;
+    if (called_from_cpp){
         first_index = 0;
-    else
-        first_index = 1;                   
-    
+        nproma_loc = nproma - 1;
+    }
+    else {
+        first_index = 1;
+        nproma_loc = nproma;
+    }
+
     if (i_blk == i_startblk) {
         i_startidx_out = std::max(first_index, i_startidx_in);
-        i_endidx_out = nproma;
+        i_endidx_out = nproma_loc;
         if (i_blk == i_endblk) {
             i_endidx_out = i_endidx_in;
         }
@@ -35,43 +40,53 @@ void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int
         i_endidx_out = i_endidx_in;
     } else {
         i_startidx_out = first_index;
-        i_endidx_out = nproma;
+        i_endidx_out = nproma_loc;
     }
 }
 
 // get_indices_e_lib function
-void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int nproma,
                         const int i_blk, const int i_startblk, const int i_endblk,
                         int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
-    
-    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, 
+
+    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from,
     //the first index should be either 0 or 1.
     int first_index;
-    if (called_from_cpp)
+    int nproma_loc;
+    if (called_from_cpp) {
         first_index = 0;
-    else
+        nproma_loc = nproma - 1;
+    }
+    else {
         first_index = 1;
+        nproma_loc = nproma;
+    }
 
     i_startidx_out = (i_blk != i_startblk) ? first_index : std::max(first_index, i_startidx_in);
-    i_endidx_out = (i_blk != i_endblk) ? nproma : i_endidx_in;
+    i_endidx_out = (i_blk != i_endblk) ? nproma_loc : i_endidx_in;
 }
 
 // get_indices_v_lib function
-void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma, 
+void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int nproma,
                         const int i_blk, const int i_startblk, const int i_endblk,
                         int &i_startidx_out, int &i_endidx_out, const bool called_from_cpp=true) {
-    
-    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from, 
+
+    //Since code is ported incrementally from Fortran to C++, depending on where the function is called from,
     //the first index should be either 0 or 1.
     int first_index;
-    if (called_from_cpp)
+    int nproma_loc;
+    if (called_from_cpp) {
         first_index = 0;
-    else
+        nproma_loc = nproma - 1;
+    }
+    else {
         first_index = 1;
+        nproma_loc = nproma;
+    }
 
     if (i_blk == i_startblk) {
         i_startidx_out = i_startidx_in;
-        i_endidx_out = nproma;
+        i_endidx_out = nproma_loc;
         if (i_blk == i_endblk) {
             i_endidx_out = i_endidx_in;
         }
@@ -80,6 +95,6 @@ void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int
         i_endidx_out = i_endidx_in;
     } else {
         i_startidx_out = first_index;
-        i_endidx_out = nproma;
+        i_endidx_out = nproma_loc;
     }
-}
\ No newline at end of file
+}
-- 
GitLab


From 0217e3e0c9f8bf2332b96a8981df91cc38d43a6b Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Fri, 4 Apr 2025 15:37:04 +0200
Subject: [PATCH 12/34] modified a few C++ functions to incorporate the
 changes made in the last commit

---
 src/horizontal/mo_lib_divrot.cpp | 74 ++++++++++++++---------------
 test/c/test_horizontal_div.cpp   | 80 ++++++++++++++++----------------
 test/c/test_horizontal_recon.cpp | 43 +++++++++++------
 test/c/test_horizontal_rot.cpp   | 18 +++----
 4 files changed, 114 insertions(+), 101 deletions(-)

diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp
index f21da12..dbbef65 100644
--- a/src/horizontal/mo_lib_divrot.cpp
+++ b/src/horizontal/mo_lib_divrot.cpp
@@ -51,13 +51,13 @@ void recon_lsq_cell_l(const T *p_cc, const int *cell_neighbor_idx,
       nblks_c);
   UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_l_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -134,13 +134,13 @@ void recon_lsq_cell_l_svd(const T *p_cc, const int *cell_neighbor_idx,
                                        lsq_dim_c, nblks_c);
   UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_l_svd_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -217,7 +217,7 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
   if (patch_id > 0 || l_limited_area) {
     Kokkos::MDRangePolicy<Kokkos::Rank<4>> initPolicy(
         {0, i_startidx_in, slev, i_startblk},
-        {lsq_dim_unk + 1, i_endidx_in, elev, i_endblk});
+        {lsq_dim_unk + 1, i_endidx_in + 1, elev + 1, i_endblk + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_q_init", initPolicy,
         KOKKOS_LAMBDA(const int ji, const int jc, const int jk, const int jb) {
@@ -225,13 +225,13 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
         });
   }
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_q_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -384,13 +384,13 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c,
         });
   }
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_q_svd_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -527,13 +527,13 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
         });
   }
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_c_step1", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -756,13 +756,13 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c,
   UnmanagedConstT3D lsq_moments_view(lsq_moments, nproma, nblks_c, lsq_dim_unk);
 
   if (patch_id > 0 || l_limited_area) {
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                         i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<3>> initPolicy(
-          {slev, i_startidx, 0}, {elev, i_endidx, lsq_dim_unk + 1});
+          {slev, i_startidx, 0}, {elev + 1, i_endidx + 1, lsq_dim_unk + 1});
       Kokkos::parallel_for(
           "recon_lsq_cell_c_svd_init", initPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc, const int ji) {
@@ -771,13 +771,13 @@ void recon_lsq_cell_c_svd(const T *p_cc, const int *lsq_idx_c,
     }
   }
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "recon_lsq_cell_c_svd_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -932,13 +932,13 @@ void div3d(const T *vec_e, const int *cell_edge_idx, const int *cell_edge_blk,
   UnmanagedConstT3D geofac_div_view(geofac_div, nproma, 3, nblks_c);
   UnmanagedT3D div_vec_c_view(div_vec_c, nproma, nlev, nblks_c);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "div3d_inner", innerPolicy, KOKKOS_LAMBDA(const int jk, const int jc) {
           div_vec_c_view(jc, jk, jb) =
@@ -980,13 +980,13 @@ void div3d_2field(const T *vec_e, const int *cell_edge_idx,
   UnmanagedConstT3D in2_view(in2, nproma, nlev, nblks_e);
   UnmanagedT3D out2_view(out2, nproma, nlev, nblks_c);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "div3d_2field_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1039,14 +1039,14 @@ void div4d(const int *cell_edge_idx, const int *cell_edge_blk,
   UnmanagedConstT4D f4din_view(f4din, nproma, nlev, nblks_e, dim4d);
   UnmanagedT4D f4dout_view(f4dout, nproma, nlev, nblks_c, dim4d);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     for (int ji = 0; ji < dim4d; ++ji) {
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev[ji], i_startidx},
-                                                         {elev[ji], i_endidx});
+                                                         {elev[ji] + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div4d_inner", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1104,13 +1104,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
   int i_endblk = i_endblk_in[0];
 
   if (l2fields) {
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb,
                         i_startblk, i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                         {elev, i_endidx});
+                                                         {elev + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div_avg_step1", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1131,13 +1131,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
           });
     }
   } else {
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb,
                         i_startblk, i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                         {elev, i_endidx});
+                                                         {elev + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div_avg_step2", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1156,13 +1156,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
     i_startblk = i_startblk_in[1];
     i_endblk = i_endblk_in[1];
 
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb,
                         i_startblk, i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                         {elev, i_endidx});
+                                                         {elev + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div_avg_step3", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1171,13 +1171,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
     }
 
     if (l2fields) {
-      for (int jb = i_startblk; jb < i_endblk; ++jb) {
+      for (int jb = i_startblk; jb <= i_endblk; ++jb) {
         int i_startidx, i_endidx;
         get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb,
                           i_startblk, i_endblk, i_startidx, i_endidx);
 
         Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                           {elev, i_endidx});
+                                                           {elev + 1, i_endidx + 1});
         Kokkos::parallel_for(
             "div_avg_step4", innerPolicy,
             KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1191,13 +1191,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
   i_endblk = i_endblk_in[2];
 
   if (l2fields) {
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb,
                         i_startblk, i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                         {elev, i_endidx});
+                                                         {elev + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div_avg_step5", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1220,13 +1220,13 @@ void div_avg(const T *vec_e, const int *cell_neighbor_idx,
           });
     }
   } else {
-    for (int jb = i_startblk; jb < i_endblk; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
       int i_startidx, i_endidx;
       get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb,
                         i_startblk, i_endblk, i_startidx, i_endidx);
 
       Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                         {elev, i_endidx});
+                                                         {elev + 1, i_endidx + 1});
       Kokkos::parallel_for(
           "div_avg_step6", innerPolicy,
           KOKKOS_LAMBDA(const int jk, const int jc) {
@@ -1269,13 +1269,13 @@ void rot_vertex_atmos(const T *vec_e, const int *vert_edge_idx,
 
   UnmanagedT3D rot_vec_view(rot_vec, nproma, nlev, nblks_v);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "rot_vertex_atmos_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jv) {
@@ -1322,13 +1322,13 @@ void rot_vertex_ri(const T *vec_e, const int *vert_edge_idx,
 
   UnmanagedT3D rot_vec_view(rot_vec, nproma, nlev, nblks_v);
 
-  for (int jb = i_startblk; jb < i_endblk; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
     Kokkos::MDRangePolicy<Kokkos::Rank<2>> innerPolicy({slev, i_startidx},
-                                                       {elev, i_endidx});
+                                                       {elev + 1, i_endidx + 1});
     Kokkos::parallel_for(
         "rot_vertex_atmos_inner", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jv) {
diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index 5904691..cf5f320 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -29,9 +29,9 @@ protected:
   static constexpr int dim4d = 2;   // 4th dimension size
 
   int i_startblk = 0;
-  int i_endblk = nblks_c; // Test blocks [0 .. nblks_c-1]
+  int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1]
   int i_startidx_in = 0;
-  int i_endidx_in = nproma; // Full range: 0 .. nproma-1
+  int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1
   std::vector<int> slev;
   std::vector<int> elev;
   bool lacc = false; // Not using ACC-specific behavior.
@@ -74,7 +74,7 @@ protected:
     // We keep slev and elev as std::vector since they are small and used only
     // on the host.
     slev.resize(dim4d, 0);
-    elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1)
+    elev.resize(dim4d, nlev - 1); // Full vertical range (0 .. nlev-1)
   }
 };
 
@@ -177,7 +177,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   // Initialize the arrays with random values.
   std::random_device rd;
   std::mt19937 gen(rd());
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
 
   for (int i = 0; i < nproma; ++i) {
@@ -217,13 +217,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0);
 
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
                              cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
@@ -380,7 +380,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
 
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
@@ -434,13 +434,13 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   std::vector<TypeParam> ref_div_vec_c(nproma * nlev * nblks_c, 0.0);
   std::vector<TypeParam> ref_f4dout(nproma * nlev * nblks_c * dim4d, 0.0);
 
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         // Calculate reference value for first field
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
@@ -626,14 +626,14 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   Kokkos::deep_copy(f4dout_h, this->f4dout);
 
   // Compute reference result and check
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
     for (int ji = 0; ji < dim4d; ++ji) {
-      for (int jk = this->slev[ji]; jk < this->elev[ji]; ++jk) {
-        for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jk = this->slev[ji]; jk <= this->elev[ji]; ++jk) {
+        for (int jc = i_startidx; jc <= i_endidx; ++jc) {
           TypeParam expected = 0.0;
           for (int je = 0; je < 3; ++je) {
             expected += f4din_h[f4din_at(
@@ -668,9 +668,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   // Vectors for additional parameters
   // Vectors for block and index ranges
   std::vector<int> i_startblk_in(3, 0);
-  std::vector<int> i_endblk_in(3, nblks_c);
+  std::vector<int> i_endblk_in(3, nblks_c - 1);
   std::vector<int> i_startidx_in(3, 0);
-  std::vector<int> i_endidx_in(3, nproma);
+  std::vector<int> i_endidx_in(3, nproma - 1);
 
   // Parameters for the test
   int patch_id = 1;
@@ -806,9 +806,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
 
   // Vectors for block and index ranges
   std::vector<int> i_startblk_in(3, 0);
-  std::vector<int> i_endblk_in(3, nblks_c);
+  std::vector<int> i_endblk_in(3, nblks_c - 1);
   std::vector<int> i_startidx_in(3, 0);
-  std::vector<int> i_endidx_in(3, nproma);
+  std::vector<int> i_endidx_in(3, nproma - 1);
 
   // Parameters for the test
   int patch_id = 1;
@@ -907,13 +907,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
   std::vector<TypeParam> ref_opt_out2(dim_combine(nproma, nlev, nblks_c));
 
   // Step 1: Calculate aux_c and aux_c2
-  for (int jb = i_startblk_in[0]; jb < i_endblk_in[0]; ++jb) {
+  for (int jb = i_startblk_in[0]; jb <= i_endblk_in[0]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb,
                       i_startblk_in[0], i_endblk_in[0], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
             vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
                              cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
@@ -940,13 +940,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
   }
 
   // Step 2: Assign aux_c to div_vec_c and aux_c2 to opt_out2 for patch_id > 0
-  for (int jb = i_startblk_in[1]; jb < i_endblk_in[1]; ++jb) {
+  for (int jb = i_startblk_in[1]; jb <= i_endblk_in[1]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb,
                       i_startblk_in[1], i_endblk_in[1], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)];
         ref_opt_out2[div_vec_c_at(jc, jk, jb)] =
@@ -956,13 +956,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
   }
 
   // Step 3: Perform averaging for the rest of the blocks
-  for (int jb = i_startblk_in[2]; jb < i_endblk_in[2]; ++jb) {
+  for (int jb = i_startblk_in[2]; jb <= i_endblk_in[2]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb,
                       i_startblk_in[2], i_endblk_in[2], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)] *
                 avg_coeff_h[avg_coeff_at(jc, 0, jb)] +
@@ -1026,9 +1026,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
 
   // Vectors for block and index ranges
   std::vector<int> i_startblk_in(3, 0);
-  std::vector<int> i_endblk_in(3, nblks_c);
+  std::vector<int> i_endblk_in(3, nblks_c - 1);
   std::vector<int> i_startidx_in(3, 0);
-  std::vector<int> i_endidx_in(3, nproma);
+  std::vector<int> i_endidx_in(3, nproma - 1);
 
   // Parameters for the test
   int patch_id = 1;
@@ -1167,9 +1167,9 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
 
   // Vectors for block and index ranges
   std::vector<int> i_startblk_in(3, 0);
-  std::vector<int> i_endblk_in(3, nblks_c);
+  std::vector<int> i_endblk_in(3, nblks_c - 1);
   std::vector<int> i_startidx_in(3, 0);
-  std::vector<int> i_endidx_in(3, nproma);
+  std::vector<int> i_endidx_in(3, nproma - 1);
 
   // Parameters for the test
   int patch_id = 1;
@@ -1268,13 +1268,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
   std::vector<TypeParam> ref_div_vec_c(dim_combine(nproma, nlev, nblks_c));
 
   // Step 1: Calculate aux_c (but not aux_c2 since l2fields=false)
-  for (int jb = i_startblk_in[0]; jb < i_endblk_in[0]; ++jb) {
+  for (int jb = i_startblk_in[0]; jb <= i_endblk_in[0]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb,
                       i_startblk_in[0], i_endblk_in[0], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         aux_c[div_vec_c_at(jc, jk, jb)] =
             vec_e_h[vec_e_at(cell_edge_idx_h[cell_edge_at(jc, jb, 0)], jk,
                              cell_edge_blk_h[cell_edge_at(jc, jb, 0)])] *
@@ -1291,13 +1291,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
 
   // Step 2: Assign aux_c to div_vec_c for patch_id > 0 (opt_out2 not updated
   // since l2fields=false)
-  for (int jb = i_startblk_in[1]; jb < i_endblk_in[1]; ++jb) {
+  for (int jb = i_startblk_in[1]; jb <= i_endblk_in[1]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb,
                       i_startblk_in[1], i_endblk_in[1], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)];
       }
@@ -1306,13 +1306,13 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
 
   // Step 3: Perform averaging for the rest of the blocks (only for div_vec_c,
   // not opt_out2)
-  for (int jb = i_startblk_in[2]; jb < i_endblk_in[2]; ++jb) {
+  for (int jb = i_startblk_in[2]; jb <= i_endblk_in[2]; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in[2], i_endidx_in[2], nproma, jb,
                       i_startblk_in[2], i_endblk_in[2], i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         ref_div_vec_c[div_vec_c_at(jc, jk, jb)] =
             aux_c[div_vec_c_at(jc, jk, jb)] *
                 avg_coeff_h[avg_coeff_at(jc, 0, jb)] +
diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index 57b77e6..d8ea721 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -48,8 +48,8 @@ protected:
 
   // Constant dimensions.
   static constexpr int nproma = 3;  // inner loop length
-  static constexpr int nlev = 1;    // number of vertical levels
-  static constexpr int nblks_c = 1; // number of cell blocks (for p_e_in)
+  static constexpr int nlev = 2;    // number of vertical levels
+  static constexpr int nblks_c = 2; // number of cell blocks (for p_e_in)
   static constexpr std::tuple<int, int> lsq_dim =
       init_lsq_dim(static_cast<ReconstructionMethod>(ReconMethod));
   static constexpr int lsq_dim_c = std::get<0>(lsq_dim);
@@ -57,11 +57,11 @@ protected:
 
   // Parameter values.
   int i_startblk = 0;
-  int i_endblk = nblks_c; // Test blocks [0 .. nblks_c-1]
+  int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1]
   int i_startidx_in = 0;
-  int i_endidx_in = nproma; // Full range: 0 .. nproma-1
+  int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1
   int slev = 0;
-  int elev = nlev; // Full vertical range (0 .. nlev-1)
+  int elev = nlev - 1; // Full vertical range (0 .. nlev-1)
   int patch_id = 0;
   bool lacc = false;          // Not using ACC-specific behavior.
   bool acc_async = false;     // No asynchronous execution.
@@ -252,7 +252,7 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(0.0, 3.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0);
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
@@ -304,13 +304,15 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
-
+ 
+  // doing the calculation only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    // calculation only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d[i] = p_cc_h[p_cc_at(
                        cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
@@ -453,7 +455,7 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(0.0, 3.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(0.0, 1.0);
 
   // Initialization
   for (int i = 0; i < nproma; ++i) {
@@ -497,12 +499,14 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
+  // doing the calculation only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    // calculation only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d[i] = p_cc_h[p_cc_at(
                        cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
@@ -746,6 +750,7 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
   Kokkos::View<TypeParam **, host_space> p_result_h("p_result_h",
                                                     lsq_dim_unk + 1, nproma);
 
+  // calculating only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
@@ -753,8 +758,9 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
 
     // Step 1: Calculate z_d values (matches the "recon_lsq_cell_q_step1"
     // parallel_for)
+    // calculating only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d_h(i, jc, jk) =
               p_cc_h[p_cc_at(
@@ -767,8 +773,9 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
 
     // Step 2: Calculate coefficients (matches the "recon_lsq_cell_q_step2"
     // parallel_for)
+    // calculating only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         // Matrix multiplication (Q^T * d)
         for (int j = 0; j < lsq_dim_unk; ++j) {
           z_qt_times_d_h(j) = 0.0;
@@ -995,12 +1002,14 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
   // Compute reference result
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
+  // calculating only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    // calculating only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d[i] = p_cc_h[p_cc_at(
                        cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
@@ -1255,12 +1264,14 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
   std::vector<TypeParam> z_d(lsq_dim_c);
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
+  // calculating only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    // calculating only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d[i] = p_cc_h[p_cc_at(
                        cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
@@ -1494,12 +1505,14 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
   std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
   std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
 
+  // calculating only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    // calculating only for jk = 0
     for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc < i_endidx; ++jc) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
           z_d[i] = p_cc_h[p_cc_at(
                        cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp
index 2c8fc46..92100e3 100644
--- a/test/c/test_horizontal_rot.cpp
+++ b/test/c/test_horizontal_rot.cpp
@@ -30,9 +30,9 @@ protected:
   static constexpr int dim4d = 2;   // 4th dimension size
 
   int i_startblk = 0;
-  int i_endblk = nblks_v; // Test blocks [0 .. nblks_v-1]
+  int i_endblk = nblks_v - 1; // Test blocks [0 .. nblks_v-1]
   int i_startidx_in = 0;
-  int i_endidx_in = nproma; // Full range: 0 .. nproma-1
+  int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1
   std::vector<int> slev;
   std::vector<int> elev;
   bool lacc = false;      // Not using ACC-specific behavior.
@@ -62,7 +62,7 @@ protected:
     // We keep slev and elev as std::vector since they are small and used only
     // on the host.
     slev.resize(dim4d, 0);
-    elev.resize(dim4d, nlev); // Full vertical range (0 .. nlev-1)
+    elev.resize(dim4d, nlev - 1); // Full vertical range (0 .. nlev-1)
   }
 };
 
@@ -217,13 +217,13 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0);
 
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jv = i_startidx; jv < i_endidx; ++jv) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
         ref_rot_vec[rot_vec_at(jv, jk, jb)] =
             vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk,
                              vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] *
@@ -406,13 +406,13 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
   // Calculate reference values separately and verify results
   std::vector<TypeParam> ref_rot_vec(nproma * nlev * nblks_v, 0.0);
 
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
-    for (int jk = this->slev[0]; jk < this->elev[0]; ++jk) {
-      for (int jv = i_startidx; jv < i_endidx; ++jv) {
+    for (int jk = this->slev[0]; jk <= this->elev[0]; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
         ref_rot_vec[rot_vec_at(jv, jk, jb)] =
             vec_e_h[vec_e_at(vert_edge_idx_h[vert_edge_at(jv, jb, 0)], jk,
                              vert_edge_blk_h[vert_edge_at(jv, jb, 0)])] *
-- 
GitLab


From 5500a3b161f7a104feab233f3c7a488ec3b16c68 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Fri, 4 Apr 2025 16:48:00 +0200
Subject: [PATCH 13/34] modified the unit tests for interpolation_vector

---
 test/c/CMakeLists.txt                |   2 +-
 test/c/test_interpolation_vector.cpp | 458 +++++++++++++++++++++------
 2 files changed, 369 insertions(+), 91 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index 2dd32f4..9d21819 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -36,7 +36,7 @@ set(SOURCES
   test_horizontal_recon.cpp
   test_horizontal_rot.cpp
   # test_tdma_solver.cpp
-  # test_interpolation_vector.cpp
+  test_interpolation_vector.cpp
   # test_intp_rbf.cpp
   # test_interpolation_scalar.cpp
 )
diff --git a/test/c/test_interpolation_vector.cpp b/test/c/test_interpolation_vector.cpp
index 680fb6e..fbab6ca 100644
--- a/test/c/test_interpolation_vector.cpp
+++ b/test/c/test_interpolation_vector.cpp
@@ -12,104 +12,382 @@
 #include <Kokkos_Core.hpp>
 #include <gtest/gtest.h>
 #include <vector>
+#include <random>
 
 #include "mo_lib_interpolation_vector.hpp"
+#include "dim_helper.hpp"
 
-// Dimensions for the test (small, trivial test).
-// We assume Fortran ordering: column-major, but our C wrappers will wrap raw
-// pointers into Kokkos::Views with LayoutLeft.
-constexpr int nproma = 2;
-constexpr int nlev = 3;
-constexpr int nblks_e = 2; // For the edge arrays (p_vn_in, p_vt_in)
-constexpr int nblks_c = 2; // For the cell arrays and interpolation coefficients
-
-// For the get_indices_c_lib inputs.
-constexpr int i_startblk = 0;
-constexpr int i_endblk = 1; // two blocks: indices 0 and 1
-constexpr int i_startidx_in = 0;
-constexpr int i_endidx_in = nproma - 1; // 0 and 1
-constexpr int slev = 0;
-constexpr int elev = nlev - 1; // 0 .. 2
-
-// Helper to compute total number of elements for a 3D array stored in
-// column-major order.
-template <typename T> size_t num_elements(int dim1, int dim2, int dim3) {
-  return static_cast<size_t>(dim1) * dim2 * dim3;
-}
+/// Base test class for the edges2cells tests. Templated for the ValueType.
+template <typename ValueType>
+class Edges2CellsVectorTest : public ::testing::Test {
+protected:
+  // Constant dimensions
+  static constexpr int nproma = 2;    // inner loop length
+  static constexpr int nlev = 3;      // number of vertical levels
+  static constexpr int nblks_e = 2;   // number of edge blocks
+  static constexpr int nblks_c = 2;   // number of cell blocks
+  static constexpr int num_edges = 3; // number of edges per cell
+
+  // Parameter values
+  int i_startblk = 0;
+  int i_endblk = nblks_c - 1; // Test blocks [0 .. nblks_c-1]
+  int i_startidx_in = 0;
+  int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1
+  int slev = 0;
+  int elev = nlev - 1; // Full vertical range (0 .. nlev-1)
+
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+  
+  // Kokkos Views for test data
+  Kokkos::View<ValueType*, memory_space> p_vn_in;
+  Kokkos::View<ValueType*, memory_space> p_vt_in;
+  Kokkos::View<int*, memory_space> cell_edge_idx;
+  Kokkos::View<int*, memory_space> cell_edge_blk;
+  Kokkos::View<ValueType*, memory_space> e_bln_c_u;
+  Kokkos::View<ValueType*, memory_space> e_bln_c_v;
+  Kokkos::View<ValueType*, memory_space> p_u_out;
+  Kokkos::View<ValueType*, memory_space> p_v_out;
+
+  Edges2CellsVectorTest() 
+      : p_vn_in("p_vn_in", dim_combine(nproma, nlev, nblks_e)),
+        p_vt_in("p_vt_in", dim_combine(nproma, nlev, nblks_e)),
+        cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, num_edges)),
+        cell_edge_blk("cell_edge_blk", dim_combine(nproma, nblks_c, num_edges)),
+        e_bln_c_u("e_bln_c_u", dim_combine(nproma, 6, nblks_c)),
+        e_bln_c_v("e_bln_c_v", dim_combine(nproma, 6, nblks_c)),
+        p_u_out("p_u_out", dim_combine(nproma, nlev, nblks_c)),
+        p_v_out("p_v_out", dim_combine(nproma, nlev, nblks_c))
+  {}
+};
+
+/// ValueTypes to test with
+typedef ::testing::Types<float, double> ValueTypes;
+
+TYPED_TEST_SUITE(Edges2CellsVectorTest, ValueTypes);
 
-// Test for the double precision (dp) version.
-TEST(Edges2CellsTest, DPTest) {
-  // Allocate and fill input arrays.
-  std::vector<double> p_vn_in(num_elements<double>(nproma, nlev, nblks_e), 1.0);
-  std::vector<double> p_vt_in(num_elements<double>(nproma, nlev, nblks_e), 1.0);
-  // cell_edge_idx and cell_edge_blk: dimensions [nproma, nblks_c, 3]
-  std::vector<int> cell_edge_idx(num_elements<int>(nproma, nblks_c, 3), 1);
-  std::vector<int> cell_edge_blk(num_elements<int>(nproma, nblks_c, 3), 1);
-
-  // Here we set cell_edge_idx to 1, 2, 1 for every triple.
-  for (int i = 0; i < num_elements<int>(nproma, nblks_c, 3); i += 3) {
-    cell_edge_idx[i] = 1;
-    cell_edge_idx[i + 1] = 2;
-    cell_edge_idx[i + 2] = 1;
+TYPED_TEST(Edges2CellsVectorTest, BasicTest) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int num_edges = this->num_edges;
+
+  // Define indexing helpers
+  const auto &vn_at = at<nproma, nlev, nblks_e>;
+  const auto &vt_at = at<nproma, nlev, nblks_e>;
+  const auto &edge_idx_at = at<nproma, nblks_c, num_edges>;
+  const auto &edge_blk_at = at<nproma, nblks_c, num_edges>;
+  const auto &bln_at = at<nproma, 6, nblks_c>;
+  const auto &out_at = at<nproma, nlev, nblks_c>;
+
+  // Create host mirror views
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto p_vt_in_h = Kokkos::create_mirror_view(this->p_vt_in);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto e_bln_c_u_h = Kokkos::create_mirror_view(this->e_bln_c_u);
+  auto e_bln_c_v_h = Kokkos::create_mirror_view(this->e_bln_c_v);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Initialize with simple values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik + ib);
+        p_vt_in_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(2.0 + ic + ik + ib);
+      }
+    }
   }
-  // Similarly, set cell_edge_blk to all ones (valid since nblks_e=2, so index 1
-  // means block 0 after subtracting 1). e_bln_c_u and e_bln_c_v: dimensions
-  // [nproma, 6, nblks_c]
-  std::vector<double> e_bln_c_u(num_elements<double>(nproma, 6, nblks_c), 1.0);
-  std::vector<double> e_bln_c_v(num_elements<double>(nproma, 6, nblks_c), 1.0);
-  // Output arrays: dimensions [nproma, nlev, nblks_c]
-  std::vector<double> p_u_out(num_elements<double>(nproma, nlev, nblks_c), 0.0);
-  std::vector<double> p_v_out(num_elements<double>(nproma, nlev, nblks_c), 0.0);
-
-  std::vector<double> p_u_ref(num_elements<double>(nproma, nlev, nblks_c), 6.0);
-  std::vector<double> p_v_ref(num_elements<double>(nproma, nlev, nblks_c), 6.0);
-
-  // Call the dp (double precision) version.
-  edges2cells_vector_lib<double>(
-      p_vn_in.data(), p_vt_in.data(), cell_edge_idx.data(),
-      cell_edge_blk.data(), e_bln_c_u.data(), e_bln_c_v.data(), p_u_out.data(),
-      p_v_out.data(), i_startblk, i_endblk, i_startidx_in, i_endidx_in, slev,
-      elev, nproma, nlev, nblks_e, nblks_c);
-
-  // Check that for each computed cell in p_u_out and p_v_out, the value is 6.
-  // This is because for each cell, the kernel adds 6 terms of 1*1.
-  for (size_t idx = 0; idx < p_u_out.size(); ++idx) {
-    EXPECT_NEAR(p_u_out[idx], p_u_ref[idx], 1e-12);
-    EXPECT_NEAR(p_v_out[idx], p_v_ref[idx], 1e-12);
+
+  // Connect each cell to 3 distinct edges: (idx, blk) = (1,1), (2,1), (1,2)
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Edge indices are 1-indexed in the function; valid range [1, nproma]
+      cell_edge_idx_h[edge_idx_at(ic, ib, 0)] = 1;
+      cell_edge_idx_h[edge_idx_at(ic, ib, 1)] = 2;
+      cell_edge_idx_h[edge_idx_at(ic, ib, 2)] = 1; // 3 would exceed nproma
+
+      // Edge blocks are 1-indexed in the function; valid range [1, nblks_e]
+      cell_edge_blk_h[edge_blk_at(ic, ib, 0)] = 1;
+      cell_edge_blk_h[edge_blk_at(ic, ib, 1)] = 1;
+      cell_edge_blk_h[edge_blk_at(ic, ib, 2)] = 2; // third edge: next block
+
+      // Initialize bilinear coefficients (6 per cell: vn/vt for each edge)
+      for (int j = 0; j < 6; ++j) {
+        e_bln_c_u_h[bln_at(ic, j, ib)] = static_cast<TypeParam>(0.1 * (j + 1));
+        e_bln_c_v_h[bln_at(ic, j, ib)] = static_cast<TypeParam>(0.05 * (j + 1));
+      }
+
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        p_v_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->p_vt_in, p_vt_in_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->e_bln_c_u, e_bln_c_u_h);
+  Kokkos::deep_copy(this->e_bln_c_v, e_bln_c_v_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Call the function
+  edges2cells_vector_lib<TypeParam>(
+      this->p_vn_in.data(), this->p_vt_in.data(),
+      this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+      this->e_bln_c_u.data(), this->e_bln_c_v.data(),
+      this->p_u_out.data(), this->p_v_out.data(),
+      this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in,
+      this->slev, this->elev, nproma, nlev, nblks_e, nblks_c);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Compute expected results on host
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam**, host_space> expected_u("expected_u", nproma, nlev);
+  Kokkos::View<TypeParam**, host_space> expected_v("expected_v", nproma, nlev);
+
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        // Compute expected values
+        expected_u(jc, jk) =
+            e_bln_c_u_h[bln_at(jc, 0, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 1, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 2, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 3, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 4, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 5, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)];
+
+        expected_v(jc, jk) =
+            e_bln_c_v_h[bln_at(jc, 0, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 1, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 2, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 3, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 4, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 5, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)];
+      }
+    }
+  }
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx_in; jc <= this->i_endidx_in; ++jc) {
+        EXPECT_NEAR(p_u_out_h[out_at(jc, jk, jb)], expected_u(jc, jk), 1e-5)
+            << "u value mismatch at jc=" << jc << ", jk=" << jk;
+        EXPECT_NEAR(p_v_out_h[out_at(jc, jk, jb)], expected_v(jc, jk), 1e-5)
+            << "v value mismatch at jc=" << jc << ", jk=" << jk;
+      }
+    }
   }
 }
 
-// Test for the single precision (sp) version.
-TEST(Edges2CellsTest, SPTest) {
-  // Allocate and fill input arrays.
-  std::vector<float> p_vn_in(num_elements<float>(nproma, nlev, nblks_e), 1.0f);
-  std::vector<float> p_vt_in(num_elements<float>(nproma, nlev, nblks_e), 1.0f);
-  std::vector<int> cell_edge_idx(num_elements<int>(nproma, nblks_c, 3), 1);
-  std::vector<int> cell_edge_blk(num_elements<int>(nproma, nblks_c, 3), 1);
-  // Set cell_edge_idx values to 1, 2, 1.
-  for (int i = 0; i < num_elements<int>(nproma, nblks_c, 3); i += 3) {
-    cell_edge_idx[i] = 1;
-    cell_edge_idx[i + 1] = 2;
-    cell_edge_idx[i + 2] = 1;
+TYPED_TEST(Edges2CellsVectorTest, RandomTest) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int num_edges = this->num_edges;
+
+  // Define indexing helpers
+  const auto &vn_at = at<nproma, nlev, nblks_e>;
+  const auto &vt_at = at<nproma, nlev, nblks_e>;
+  const auto &edge_idx_at = at<nproma, nblks_c, num_edges>;
+  const auto &edge_blk_at = at<nproma, nblks_c, num_edges>;
+  const auto &bln_at = at<nproma, 6, nblks_c>;
+  const auto &out_at = at<nproma, nlev, nblks_c>;
+
+  // Create host mirror views
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto p_vt_in_h = Kokkos::create_mirror_view(this->p_vt_in);
+  auto cell_edge_idx_h = Kokkos::create_mirror_view(this->cell_edge_idx);
+  auto cell_edge_blk_h = Kokkos::create_mirror_view(this->cell_edge_blk);
+  auto e_bln_c_u_h = Kokkos::create_mirror_view(this->e_bln_c_u);
+  auto e_bln_c_v_h = Kokkos::create_mirror_view(this->e_bln_c_v);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Use fixed seed for reproducibility; index ranges are 1-based, inclusive
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(1, nproma); // edge idx <= nproma
+  std::uniform_int_distribution<int> block_distrib(1, nblks_e);
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vn_in_h[vn_at(ic, ik, ib)] = real_distrib(gen);
+        p_vt_in_h[vt_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Set each cell to connect to random edges
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Edge indices are 1-indexed in the function
+      cell_edge_idx_h[edge_idx_at(ic, ib, 0)] = edge_distrib(gen);
+      cell_edge_idx_h[edge_idx_at(ic, ib, 1)] = edge_distrib(gen);
+      cell_edge_idx_h[edge_idx_at(ic, ib, 2)] = edge_distrib(gen);
+
+      // Edge blocks are 1-indexed in the function
+      cell_edge_blk_h[edge_blk_at(ic, ib, 0)] = block_distrib(gen);
+      cell_edge_blk_h[edge_blk_at(ic, ib, 1)] = block_distrib(gen);
+      cell_edge_blk_h[edge_blk_at(ic, ib, 2)] = block_distrib(gen);
+
+      // Initialize random bilinear coefficients
+      for (int j = 0; j < 6; ++j) {
+        e_bln_c_u_h[bln_at(ic, j, ib)] = real_distrib(gen);
+        e_bln_c_v_h[bln_at(ic, j, ib)] = real_distrib(gen);
+      }
+
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        p_v_out_h[out_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
   }
-  std::vector<float> e_bln_c_u(num_elements<float>(nproma, 6, nblks_c), 1.0f);
-  std::vector<float> e_bln_c_v(num_elements<float>(nproma, 6, nblks_c), 1.0f);
-  std::vector<float> p_u_out(num_elements<float>(nproma, nlev, nblks_c), 0.0f);
-  std::vector<float> p_v_out(num_elements<float>(nproma, nlev, nblks_c), 0.0f);
-
-  std::vector<float> p_u_ref(num_elements<float>(nproma, nlev, nblks_c), 6.0f);
-  std::vector<float> p_v_ref(num_elements<float>(nproma, nlev, nblks_c), 6.0f);
-
-  // Call the sp (float precision) version.
-  edges2cells_vector_lib<float>(
-      p_vn_in.data(), p_vt_in.data(), cell_edge_idx.data(),
-      cell_edge_blk.data(), e_bln_c_u.data(), e_bln_c_v.data(), p_u_out.data(),
-      p_v_out.data(), i_startblk, i_endblk, i_startidx_in, i_endidx_in, slev,
-      elev, nproma, nlev, nblks_e, nblks_c);
-
-  // Verify that every computed output equals 6.
-  for (size_t idx = 0; idx < p_u_out.size(); ++idx) {
-    EXPECT_NEAR(p_u_out[idx], p_u_ref[idx], 1e-5f);
-    EXPECT_NEAR(p_v_out[idx], p_v_ref[idx], 1e-5f);
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->p_vt_in, p_vt_in_h);
+  Kokkos::deep_copy(this->cell_edge_idx, cell_edge_idx_h);
+  Kokkos::deep_copy(this->cell_edge_blk, cell_edge_blk_h);
+  Kokkos::deep_copy(this->e_bln_c_u, e_bln_c_u_h);
+  Kokkos::deep_copy(this->e_bln_c_v, e_bln_c_v_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Call the function
+  edges2cells_vector_lib<TypeParam>(
+      this->p_vn_in.data(), this->p_vt_in.data(),
+      this->cell_edge_idx.data(), this->cell_edge_blk.data(),
+      this->e_bln_c_u.data(), this->e_bln_c_v.data(),
+      this->p_u_out.data(), this->p_v_out.data(),
+      this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in,
+      this->slev, this->elev, nproma, nlev, nblks_e, nblks_c);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Compute expected results on host
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_u("expected_u", nproma, nlev, nblks_c);
+  Kokkos::View<TypeParam***, host_space> expected_v("expected_v", nproma, nlev, nblks_c);
+
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        // Compute expected values
+        expected_u(jc, jk, jb) =
+            e_bln_c_u_h[bln_at(jc, 0, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 1, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 2, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 3, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 4, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] +
+            e_bln_c_u_h[bln_at(jc, 5, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)];
+
+        expected_v(jc, jk, jb) =
+            e_bln_c_v_h[bln_at(jc, 0, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 1, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 0)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 0)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 2, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 3, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 1)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 1)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 4, jb)] *
+                p_vn_in_h[vn_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)] +
+            e_bln_c_v_h[bln_at(jc, 5, jb)] *
+                p_vt_in_h[vt_at(cell_edge_idx_h[edge_idx_at(jc, jb, 2)] - 1, jk,
+                             cell_edge_blk_h[edge_blk_at(jc, jb, 2)] - 1)];
+      }
+    }
+  }
+
+  Kokkos::fence();
+
+  // Verify results for every block jb (not only block 0)
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx_in; jc <= this->i_endidx_in; ++jc) {
+        EXPECT_NEAR(p_u_out_h[out_at(jc, jk, jb)], expected_u(jc, jk, jb), 1e-5)
+            << "u value mismatch at jc=" << jc << ", jk=" << jk << ", jb=" << jb;
+        EXPECT_NEAR(p_v_out_h[out_at(jc, jk, jb)], expected_v(jc, jk, jb), 1e-5)
+            << "v value mismatch at jc=" << jc << ", jk=" << jk << ", jb=" << jb;
+      }
+    }
   }
 }
-- 
GitLab


From fecec078c8572e6c0aebe86de7bdeb8368c22ccc Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 7 Apr 2025 21:29:36 +0200
Subject: [PATCH 14/34] fixed a bug in mo_lib_interpolation_scalar

---
 src/interpolation/mo_lib_interpolation_scalar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/interpolation/mo_lib_interpolation_scalar.cpp b/src/interpolation/mo_lib_interpolation_scalar.cpp
index 8910cb2..51edcda 100644
--- a/src/interpolation/mo_lib_interpolation_scalar.cpp
+++ b/src/interpolation/mo_lib_interpolation_scalar.cpp
@@ -569,7 +569,7 @@ void cell_avg_lib(const T *psi_c, const int *cell_neighbor_idx,
   UnmanagedConstInt3D iblk_view(cell_neighbor_blk, nproma, nblks_c,
                                 3); // cell_neighbour_blk
   // averaging coefficients, dim: (nproma,nlev,nblks_c)
-  UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, nlev, nblks_c);
+  UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, 4, nblks_c);
 
   // cell based variable after averaging, dim: (nproma,nlev,nblks_c)
   UnmanagedT3D avg_psi_c_view(avg_psi_c, nproma, nlev, nblks_c);
-- 
GitLab


From c6a39b5a36b8981f67495f2ea5f32f9af42a08c0 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 7 Apr 2025 21:30:40 +0200
Subject: [PATCH 15/34] corrected the way loop_exchange is defined in test
 CMake

---
 test/c/CMakeLists.txt | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index 9d21819..b2bba98 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -26,9 +26,6 @@ message(CHECK_PASS "done")
 # Find Kokkos (or use your existing Kokkos installation)
 # find_package(Kokkos REQUIRED)
 
-if(IM_ENABLE_LOOP_EXCHANGE)
-  target_compile_definitions(iconmath-interpolation PRIVATE __LOOP_EXCHANGE)
-endif()
 
 set(SOURCES
   main.cpp
@@ -43,6 +40,10 @@ set(SOURCES
 # Create the test executable from your test files, including main.cpp.
 add_executable(iconmath_test_c ${SOURCES})
 
+if(IM_ENABLE_LOOP_EXCHANGE)
+  target_compile_definitions(iconmath_test_c PRIVATE __LOOP_EXCHANGE)
+endif()
+
 target_include_directories(iconmath_test_c PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
 
 # Link the test executable with GoogleTest and Kokkos.
-- 
GitLab


From aab7047dbdb8b2686c8b2b3dd04f0d54e9396d30 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 7 Apr 2025 21:31:27 +0200
Subject: [PATCH 16/34] renamed few unit-tests

---
 test/c/test_interpolation_vector.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/c/test_interpolation_vector.cpp b/test/c/test_interpolation_vector.cpp
index fbab6ca..497b1d5 100644
--- a/test/c/test_interpolation_vector.cpp
+++ b/test/c/test_interpolation_vector.cpp
@@ -19,7 +19,7 @@
 
 /// Base test class for the edges2cells tests. Templated for the ValueType.
 template <typename ValueType>
-class Edges2CellsVectorTest : public ::testing::Test {
+class InterpolationVectorTest : public ::testing::Test {
 protected:
   // Constant dimensions
   static constexpr int nproma = 2;    // inner loop length
@@ -50,7 +50,7 @@ protected:
   Kokkos::View<ValueType*, memory_space> p_u_out;
   Kokkos::View<ValueType*, memory_space> p_v_out;
 
-  Edges2CellsVectorTest() 
+  InterpolationVectorTest() 
       : p_vn_in("p_vn_in", dim_combine(nproma, nlev, nblks_e)),
         p_vt_in("p_vt_in", dim_combine(nproma, nlev, nblks_e)),
         cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, num_edges)),
@@ -65,9 +65,9 @@ protected:
 /// ValueTypes to test with
 typedef ::testing::Types<float, double> ValueTypes;
 
-TYPED_TEST_SUITE(Edges2CellsVectorTest, ValueTypes);
+TYPED_TEST_SUITE(InterpolationVectorTest, ValueTypes);
 
-TYPED_TEST(Edges2CellsVectorTest, BasicTest) {
+TYPED_TEST(InterpolationVectorTest, Edges2CellsSpecific) {
   constexpr int nproma = this->nproma;
   constexpr int nlev = this->nlev;
   constexpr int nblks_e = this->nblks_e;
@@ -226,7 +226,7 @@ TYPED_TEST(Edges2CellsVectorTest, BasicTest) {
   }
 }
 
-TYPED_TEST(Edges2CellsVectorTest, RandomTest) {
+TYPED_TEST(InterpolationVectorTest, Edges2CellsRandom) {
   constexpr int nproma = this->nproma;
   constexpr int nlev = this->nlev;
   constexpr int nblks_e = this->nblks_e;
-- 
GitLab


From b48e6d558213605da6fcb953f653405eb5dfdbbb Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 7 Apr 2025 21:32:08 +0200
Subject: [PATCH 17/34] change the random number generation in
 test_horizontal_rot

---
 test/c/test_horizontal_rot.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp
index 92100e3..69e9d03 100644
--- a/test/c/test_horizontal_rot.cpp
+++ b/test/c/test_horizontal_rot.cpp
@@ -170,7 +170,7 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
 
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
@@ -359,7 +359,7 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
   std::random_device rd;
   std::mt19937 gen(rd());
   std::uniform_int_distribution<int> int_distrib(0, nproma - 1);
-  std::uniform_real_distribution<TypeParam> real_distrib(-10.0, 10.0);
+  std::uniform_real_distribution<TypeParam> real_distrib(-1.0, 1.0);
 
   // Initialization with random values
   for (int i = 0; i < nproma; ++i) {
-- 
GitLab


From c1af2bc238c574276c43e47b585e7a4b3a8cd43e Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 7 Apr 2025 21:32:59 +0200
Subject: [PATCH 18/34] Made an overhaul of test_interpolation_scalar

removed normalization of the input arrays in test_interpolation_scalar
---
 test/c/CMakeLists.txt                |    2 +-
 test/c/test_interpolation_scalar.cpp | 2075 ++++++++++++++++++++++----
 2 files changed, 1777 insertions(+), 300 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index b2bba98..e707970 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -35,7 +35,7 @@ set(SOURCES
   # test_tdma_solver.cpp
   test_interpolation_vector.cpp
   # test_intp_rbf.cpp
-  # test_interpolation_scalar.cpp
+  test_interpolation_scalar.cpp
 )
 # Create the test executable from your test files, including main.cpp.
 add_executable(iconmath_test_c ${SOURCES})
diff --git a/test/c/test_interpolation_scalar.cpp b/test/c/test_interpolation_scalar.cpp
index 507ec3f..94c33dd 100644
--- a/test/c/test_interpolation_scalar.cpp
+++ b/test/c/test_interpolation_scalar.cpp
@@ -10,18 +10,12 @@
 // ---------------------------------------------------------------
 
 #include "mo_lib_interpolation_scalar.hpp"
+#include "mo_lib_loopindices.hpp"
 #include <Kokkos_Core.hpp>
 #include <gtest/gtest.h>
 #include <vector>
-
-// Free-function helpers for 3D and 4D array sizes (assumed column-major)
-template <typename T> size_t num_elements_3d(int d1, int d2, int d3) {
-  return static_cast<size_t>(d1) * d2 * d3;
-}
-
-template <typename T> size_t num_elements_4d(int d1, int d2, int d3, int d4) {
-  return static_cast<size_t>(d1) * d2 * d3 * d4;
-}
+#include <random>
+#include "dim_helper.hpp"
 
 // Define a helper struct that holds the two types.
 template <typename InT, typename OutT> struct MixedPrecision {
@@ -44,116 +38,99 @@ typedef ::testing::Types<MixedPrecision<double, double>,
 class interp_dimensions {
 public:
   // Constant dimensions.
-  static constexpr int nproma = 16; // inner loop length
-  static constexpr int nlev = 7;    // number of vertical levels
+  static constexpr int nproma = 2; // inner loop length
+  static constexpr int nlev = 3;    // number of vertical levels
   static constexpr int nblks_c = 2; // number of cell blocks
   static constexpr int nblks_e = 2; // number of edge blocks (for p_e_in)
   static constexpr int nblks_v = 2; // number of vertex blocks
+  static constexpr int cell_type = 6;
+  static constexpr int npromz_c = 2;
+
 
   // Parameter values.
   const int i_startblk = 0;
-  const int i_endblk = 1; // Test blocks [0, 1]
-  const int i_startidx = 2;
-  const int i_endidx = nproma - 3; // Partial range: 2 .. nproma-3
-  const int slev = 1;
+  const int i_endblk = nblks_c - 1; // Last block (inclusive): blocks [0, nblks_c-1]
+  const int i_startidx = 0; // First index (inclusive)
+  const int i_endidx = nproma - 1; // Last index (inclusive): full range 0 .. nproma-1
+  const int slev = 0; // First level (inclusive): with elev, full range 0 .. nlev-1
   const int elev = nlev - 1;    // Partial vertical range (1 .. nlev-1)
   const bool lacc = false;      // Not using ACC-specific behavior.
   const bool acc_async = false; // No asynchronous execution.
 };
 
-template <typename T>
-class InterpolationScalarTypedTestFixture : public ::testing::Test,
+template <typename ValueType>
+class InterpolationScalarSingleParamTest : public ::testing::Test,
                                             public interp_dimensions {
-public:
-  // Arrays used for verts2edges
-  std::vector<T> p_vertex_in;       // Dimensions: (nproma, nlev, nblks_v)
-  std::vector<int> edge_vertex_idx; // Dimensions: (nproma, nblks_e, 4)
-  std::vector<int> edge_vertex_blk; // Dimensions: (nproma, nblks_e, 4)
-  std::vector<T> coeff_int_edges;   // Dimensions: (nproma, 2, nblks_e)
-  std::vector<T> p_edge_out;        // Dimensions: (nproma, nlev, nblks_e)
-
-  // Arrays used for edges2verts
-  std::vector<T> p_edge_in;       // Dimensions: (nproma, nlev, nblks_e)
-  std::vector<int> edge_vert_idx; // Dimensions: (nproma, nblks_e, 6)
-  std::vector<int> edge_vert_blk; // Dimensions: (nproma, nblks_e, 6)
-  std::vector<T> v_int;           // Dimensions: (nproma, 6, nblks_v)
-  std::vector<T> p_vert_out;      // Dimensions: (nproma, nlev, nblks_v)
-
-  // Arrays used for edges2cells
-  // std::vector<T> p_edge_in;        // Dimensions: (nproma, nlev, nblks_e)
-  std::vector<int> edge_idx;      // Dimensions: (nproma, nblks_c, 3)
-  std::vector<int> edge_blk;      // Dimensions: (nproma, nblks_c, 3)
-  std::vector<T> coeff_int_cells; // Dimensions: (nproma, 3, nblks_c)
-  std::vector<T> p_cell_out;      // Dimensions: (nproma, nlev, nblks_c)
+  protected:
+  // Fixture data: flattened 1D views, one per array; sizes are set in the constructor.
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+  // Arrays used for verts2edges
+  Kokkos::View<ValueType*, memory_space> p_vertex_in; // Dimensions: (nproma, nlev, nblks_v)
+  Kokkos::View<int*, memory_space> edge_vertex_idx; // Dimensions: (nproma, nblks_e, 4)
+  Kokkos::View<int*, memory_space> edge_vertex_blk; // Dimensions: (nproma, nblks_e, 4)
+  Kokkos::View<ValueType*, memory_space> coeff_int_edges; // Dimensions: (nproma, 2, nblks_e)
+  Kokkos::View<ValueType*, memory_space> p_edge_out; // Dimensions: (nproma, nlev, nblks_e)
+
+  // Arrays used for edges2verts
+  Kokkos::View<ValueType*, memory_space> p_edge_in; // Dimensions: (nproma, nlev, nblks_e)
+  Kokkos::View<int*, memory_space> edge_vert_idx; // Allocated as (nproma, nblks_e, 6); NOTE(review): tests index it as (nproma, nblks_v, 6)
+  Kokkos::View<int*, memory_space> edge_vert_blk; // Allocated as (nproma, nblks_e, 6); NOTE(review): tests index it as (nproma, nblks_v, 6)
+  Kokkos::View<ValueType*, memory_space> v_int; // Dimensions: (nproma, 6, nblks_v)
+  Kokkos::View<ValueType*, memory_space> p_vert_out; // Dimensions: (nproma, nlev, nblks_v)
+
+  // Arrays used for edges2cells (the edge input is p_edge_in above)
+  Kokkos::View<int*, memory_space> edge_idx;      // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<int*, memory_space> edge_blk;      // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<ValueType*, memory_space> coeff_int_cells; // Dimensions: (nproma, 3, nblks_c)
+  Kokkos::View<ValueType*, memory_space> p_cell_out;      // Dimensions: (nproma, nlev, nblks_c)
 
   // Arrays used for verts2cells
-  std::vector<T> p_vert_in;        // Dimensions: (nproma, nlev, nblks_v)
-  std::vector<int> cell_index_idx; // Dimensions: (nproma, nblks_c, 3)
-  std::vector<int> cell_index_blk; // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<ValueType*, memory_space> p_vert_in;        // Dimensions: (nproma, nlev, nblks_v)
+  Kokkos::View<int*, memory_space> cell_index_idx;         // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<int*, memory_space> cell_index_blk;         // Dimensions: (nproma, nblks_c, 3)
 
   // Arrays used for avg_lib
-  std::vector<T> psi_c;               // Dimensions: (nproma, nlev, nblks_c)
-  std::vector<int> cell_neighbor_idx; // Dimensions: (nproma, nblks_c, 3)
-  std::vector<int> cell_neighbor_blk; // Dimensions: (nproma, nblks_c, 3)
-  std::vector<T> avg_coeff;           // Dimensions: (nproma, nlev, nblks_c)
-  std::vector<T> avg_psi_c;           // Dimensions: (nproma, nlev, nblks_c)
-
-  const int cell_type = 6;
-  const int npromz_c = 32;
-
-  InterpolationScalarTypedTestFixture() {
-    // Allocate and initialize arrays needed for verts2edges
-    p_vertex_in.resize(num_elements_3d<T>(nproma, nlev, nblks_v),
-                       static_cast<T>(1));
-    edge_vertex_idx.resize(num_elements_3d<int>(nproma, nblks_e, 4), 1);
-    edge_vertex_blk.resize(num_elements_3d<int>(nproma, nblks_e, 4), 0);
-    coeff_int_edges.resize(num_elements_3d<T>(nproma, 2, nblks_e),
-                           static_cast<T>(1));
-
-    p_edge_out.resize(num_elements_3d<T>(nproma, nlev, nblks_e),
-                      static_cast<T>(0));
-
-    // Allocate & Initialize arrays needed for edges2verts
-    p_edge_in.resize(num_elements_3d<T>(nproma, nlev, nblks_e),
-                     static_cast<T>(1));
-    edge_vert_idx.resize(num_elements_3d<int>(nproma, nblks_e, 6), 1);
-    edge_vert_blk.resize(num_elements_3d<int>(nproma, nblks_e, 6), 0);
-    v_int.resize(num_elements_3d<T>(nproma, 6, nblks_v), static_cast<T>(1));
-
-    p_vert_out.resize(num_elements_3d<T>(nproma, nlev, nblks_v),
-                      static_cast<T>(0));
-
-    // Allocate & Initialize arrays needed for edges2cells
-    edge_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1);
-    edge_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0);
-    coeff_int_cells.resize(num_elements_3d<T>(nproma, 3, nblks_c),
-                           static_cast<T>(1));
-
-    p_cell_out.resize(num_elements_3d<T>(nproma, nlev, nblks_c),
-                      static_cast<T>(0));
-
-    // Allocate and initialize arrays needed for verts2cells
-    p_vert_in.resize(num_elements_3d<T>(nproma, nlev, nblks_v),
-                     static_cast<T>(1));
-    cell_index_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1);
-    cell_index_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0);
-
-    // Allocate and initialize arrays needed for avg_lib
-    psi_c.resize(num_elements_3d<T>(nproma, nlev, nblks_c), static_cast<T>(1));
-    cell_neighbor_idx.resize(num_elements_3d<int>(nproma, nblks_c, 3), 1);
-    cell_neighbor_blk.resize(num_elements_3d<int>(nproma, nblks_c, 3), 0);
-    avg_coeff.resize(num_elements_3d<T>(nproma, nlev, nblks_c),
-                     static_cast<T>(1));
-
-    // Allocate output arrays and initialize to zero.
-    avg_psi_c.resize(num_elements_3d<T>(nproma, nlev, nblks_c),
-                     static_cast<T>(0));
-  }
+  Kokkos::View<ValueType*, memory_space> psi_c;               // Dimensions: (nproma, nlev, nblks_c)
+  Kokkos::View<int*, memory_space> cell_neighbor_idx;         // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<int*, memory_space> cell_neighbor_blk;         // Dimensions: (nproma, nblks_c, 3)
+  Kokkos::View<ValueType*, memory_space> avg_coeff;           // Dimensions: (nproma, 4, nblks_c)
+  Kokkos::View<ValueType*, memory_space> avg_psi_c;           // Dimensions: (nproma, nlev, nblks_c)
+
+  InterpolationScalarSingleParamTest()
+      : p_vertex_in("p_vertex_in", nproma * nlev * nblks_v),
+        edge_vertex_idx("edge_vertex_idx", nproma * nblks_e * 4),
+        edge_vertex_blk("edge_vertex_blk", nproma * nblks_e * 4),
+        coeff_int_edges("coeff_int_edges", nproma * 2 * nblks_e),
+        p_edge_out("p_edge_out", nproma * nlev * nblks_e),
+        
+        p_edge_in("p_edge_in", nproma * nlev * nblks_e),
+        edge_vert_idx("edge_vert_idx", nproma * nblks_e * 6),
+        edge_vert_blk("edge_vert_blk", nproma * nblks_e * 6),
+        v_int("v_int", nproma * 6 * nblks_v),
+        p_vert_out("p_vert_out", nproma * nlev * nblks_v),
+
+        edge_idx("edge_idx", nproma * nblks_c * 3),
+        edge_blk("edge_blk", nproma * nblks_c * 3),
+        coeff_int_cells("coeff_int_cells", nproma * 3 * nblks_c),
+        p_cell_out("p_cell_out", nproma * nlev * nblks_c),
+
+        p_vert_in("p_vert_in", nproma * nlev * nblks_v),
+        cell_index_idx("cell_index_idx", nproma * nblks_c * 3),
+        cell_index_blk("cell_index_blk", nproma * nblks_c * 3),
+
+        psi_c("psi_c", nproma * nlev * nblks_c),
+        cell_neighbor_idx("cell_neighbor_idx", nproma * nblks_c * 3),
+        cell_neighbor_blk("cell_neighbor_blk", nproma * nblks_c * 3),
+        avg_coeff("avg_coeff", nproma * 4 * nblks_c),  // 4 coefficients (self + 3 neighbors)
+        avg_psi_c("avg_psi_c", nproma * nlev * nblks_c)
+  {}
 };
 
 typedef ::testing::Types<float, double> SingleType;
 
-TYPED_TEST_SUITE(InterpolationScalarTypedTestFixture, SingleType);
+TYPED_TEST_SUITE(InterpolationScalarSingleParamTest, SingleType);
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -161,29 +138,206 @@ TYPED_TEST_SUITE(InterpolationScalarTypedTestFixture, SingleType);
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Edges) {
+TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesSpecific) {
+  // Runs verts2edges_scalar_lib on a deterministic input pattern and compares against tabulated values.
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int nblks_e = this->nblks_e;
+
+  // Define indexing helpers
+  const auto &vertex_at = at<nproma, nlev, nblks_v>;
+  const auto &idx_at = at<nproma, nblks_e, 4>;
+  const auto &blk_at = at<nproma, nblks_e, 4>;
+  const auto &coeff_at = at<nproma, 2, nblks_e>;
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+
+  // Create host mirror views
+  auto p_vertex_in_h = Kokkos::create_mirror_view(this->p_vertex_in);
+  auto edge_vertex_idx_h = Kokkos::create_mirror_view(this->edge_vertex_idx);
+  auto edge_vertex_blk_h = Kokkos::create_mirror_view(this->edge_vertex_blk);
+  auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges);
+  auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out);
+
+  // Initialize with specific test values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vertex_in_h[vertex_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Initialize edge connectivity indices with specific pattern
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each edge connects to two specific vertices
+      edge_vertex_idx_h[idx_at(ic, ib, 0)] = ic % (nproma - 1); // First vertex index (always 0 for nproma == 2; the table below relies on it)
+      edge_vertex_idx_h[idx_at(ic, ib, 1)] = (ic + 1) % nproma; // Second vertex index
+      edge_vertex_idx_h[idx_at(ic, ib, 2)] = 0; // Not used
+      edge_vertex_idx_h[idx_at(ic, ib, 3)] = 0; // Not used
+
+      edge_vertex_blk_h[blk_at(ic, ib, 0)] = ib % nblks_v; // First vertex block
+      edge_vertex_blk_h[blk_at(ic, ib, 1)] = (ib + 1) % nblks_v; // Second vertex block
+      edge_vertex_blk_h[blk_at(ic, ib, 2)] = 0; // Not used
+      edge_vertex_blk_h[blk_at(ic, ib, 3)] = 0; // Not used
+
+      coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(0.5 + ic * 0.01);
+      coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(0.5 + ib * 0.01);
+      
+      // Initialize output to zero (the expected results are tabulated further down)
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // copy data to device
+  Kokkos::deep_copy(this->p_vertex_in, p_vertex_in_h);
+  Kokkos::deep_copy(this->edge_vertex_idx, edge_vertex_idx_h);
+  Kokkos::deep_copy(this->edge_vertex_blk, edge_vertex_blk_h);
+  Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h);
+  Kokkos::deep_copy(this->p_edge_out, p_edge_out_h);
 
   verts2edges_scalar_lib<TypeParam>(
       this->p_vertex_in.data(), this->edge_vertex_idx.data(),
       this->edge_vertex_blk.data(), this->coeff_int_edges.data(),
       this->p_edge_out.data(), this->i_startblk, this->i_endblk,
-      this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma,
-      this->nlev, this->nblks_v, this->nblks_e, this->lacc);
-
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx, i_endidx] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 2 stencil points,
-        // expect 2.
-        EXPECT_NEAR(this->p_edge_out[idx], static_cast<TypeParam>(2),
-                    static_cast<TypeParam>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+      this->i_startidx, this->i_endidx, this->slev, this->elev, nproma,
+      nlev, nblks_v, nblks_e, this->lacc);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
+
+  // Expected results for the inputs above, looked up via edge_at (nproma*nlev*nblks_e = 12 entries)
+  std::vector<TypeParam> expected_edges(12);
+  int idx = 0;
+  std::generate(expected_edges.begin(), expected_edges.end(), [&idx]() {
+    TypeParam values[] = {
+      1.505, 1.015, 1.605, 1.116, 1.705, 1.217,
+      1.525, 1.0251, 1.626, 1.1271, 1.727, 1.2291
+    };
+    return values[idx++];
+  });
+
+  // Verify results over the inclusive block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        EXPECT_NEAR(p_edge_out_h[edge_at(jv, jk, jb)], 
+                   expected_edges[edge_at(jv, jk, jb)], 
+                   static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+      }
+    }
+  }
+}
+
+// Repeat the same test with randomized data
+TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesRandom) {
+  // Runs verts2edges_scalar_lib on seeded random input and compares against a host-side reference.
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int nblks_e = this->nblks_e;
+
+  // Define indexing helpers
+  const auto &vertex_at = at<nproma, nlev, nblks_v>;
+  const auto &idx_at = at<nproma, nblks_e, 4>;
+  const auto &blk_at = at<nproma, nblks_e, 4>;
+  const auto &coeff_at = at<nproma, 2, nblks_e>;
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+
+  // Create host mirror views
+  auto p_vertex_in_h = Kokkos::create_mirror_view(this->p_vertex_in);
+  auto edge_vertex_idx_h = Kokkos::create_mirror_view(this->edge_vertex_idx);
+  auto edge_vertex_blk_h = Kokkos::create_mirror_view(this->edge_vertex_blk);
+  auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges);
+  auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); // draws vertex indices within a block
+  std::uniform_int_distribution<int> block_distrib(0, nblks_v - 1); // draws vertex block numbers
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vertex_in_h[vertex_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Initialize edge connectivity indices with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      edge_vertex_idx_h[idx_at(ic, ib, 0)] = edge_distrib(gen);
+      edge_vertex_idx_h[idx_at(ic, ib, 1)] = edge_distrib(gen);
+      edge_vertex_idx_h[idx_at(ic, ib, 2)] = edge_distrib(gen);
+      edge_vertex_idx_h[idx_at(ic, ib, 3)] = edge_distrib(gen);
+
+      edge_vertex_blk_h[blk_at(ic, ib, 0)] = block_distrib(gen);
+      edge_vertex_blk_h[blk_at(ic, ib, 1)] = block_distrib(gen);
+      edge_vertex_blk_h[blk_at(ic, ib, 2)] = block_distrib(gen);
+      edge_vertex_blk_h[blk_at(ic, ib, 3)] = block_distrib(gen);
+
+      coeff_int_edges_h[coeff_at(ic, 0, ib)] = real_distrib(gen);
+      coeff_int_edges_h[coeff_at(ic, 1, ib)] = real_distrib(gen);
+
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // copy data to device
+  Kokkos::deep_copy(this->p_vertex_in, p_vertex_in_h);
+  Kokkos::deep_copy(this->edge_vertex_idx, edge_vertex_idx_h);
+  Kokkos::deep_copy(this->edge_vertex_blk, edge_vertex_blk_h);
+  Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h);
+  Kokkos::deep_copy(this->p_edge_out, p_edge_out_h);
+
+  // Call the function
+  verts2edges_scalar_lib<TypeParam>(
+      this->p_vertex_in.data(), this->edge_vertex_idx.data(),
+      this->edge_vertex_blk.data(), this->coeff_int_edges.data(),
+      this->p_edge_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev, nproma,
+      nlev, nblks_v, nblks_e, this->lacc);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
+
+  // Host-side 3D view for the reference values (indexed directly, not through edge_at)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_edges("expected_edges", nproma, nlev, nblks_e);
+
+  for (int ib = this->i_startblk; ib <= this->i_endblk; ++ib) {
+    for (int ik = this->slev; ik <= this->elev; ++ik) {
+      for (int ic = this->i_startidx; ic <= this->i_endidx; ++ic) {
+        // Reference: coefficient-weighted sum of stencil vertices 0 and 1 (entries 2 and 3 are not read here)
+        expected_edges(ic, ik, ib) =
+            coeff_int_edges_h[coeff_at(ic, 0, ib)] *
+                p_vertex_in_h[vertex_at(edge_vertex_idx_h[idx_at(ic, ib, 0)], ik,
+                                        edge_vertex_blk_h[blk_at(ic, ib, 0)])] +
+            coeff_int_edges_h[coeff_at(ic, 1, ib)] *
+                p_vertex_in_h[vertex_at(edge_vertex_idx_h[idx_at(ic, ib, 1)], ik,
+                                        edge_vertex_blk_h[blk_at(ic, ib, 1)])];
+      }
+    }
+  }
+
+  // Verify results over the inclusive block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        EXPECT_NEAR(p_edge_out_h[edge_at(jv, jk, jb)], 
+                   expected_edges(jv, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
   }
@@ -195,29 +349,199 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Edges) {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Verts) {
+TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsSpecific) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Define indexing helpers
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in);
+  auto edge_vert_idx_h = Kokkos::create_mirror_view(this->edge_vert_idx);
+  auto edge_vert_blk_h = Kokkos::create_mirror_view(this->edge_vert_blk);
+  auto v_int_h = Kokkos::create_mirror_view(this->v_int);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Initialize with index-based test values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_edge_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with specific pattern
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex connects to 6 edges
+      for (int j = 0; j < 6; ++j) {
+        // Edge indices with a pattern
+        edge_vert_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma;
+        edge_vert_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_e;
+        
+        // Interpolation coefficients that depend on indices
+        v_int_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 6.0 + j * 0.01);
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
 
+  // Copy to device
+  Kokkos::deep_copy(this->p_edge_in, p_edge_in_h);
+  Kokkos::deep_copy(this->edge_vert_idx, edge_vert_idx_h);
+  Kokkos::deep_copy(this->edge_vert_blk, edge_vert_blk_h);
+  Kokkos::deep_copy(this->v_int, v_int_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
+  // Call the function under test
   edges2verts_scalar_lib<TypeParam>(
       this->p_edge_in.data(), this->edge_vert_idx.data(),
-      this->edge_vert_blk.data(), this->v_int.data(), this->p_vert_out.data(),
-      this->i_startblk, this->i_endblk, this->i_startidx, this->i_endidx,
-      this->slev, this->elev, this->nproma, this->nlev, this->nblks_e,
-      this->nblks_v, this->lacc);
-
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 6 stencil points,
-        // expect 6.
-        EXPECT_NEAR(this->p_vert_out[idx], static_cast<TypeParam>(6),
-                    static_cast<TypeParam>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+      this->edge_vert_blk.data(), this->v_int.data(),
+      this->p_vert_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_e, nblks_v, this->lacc);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Expected results for the inputs above, looked up via vert_at (nproma*nlev*nblks_v = 12 entries)
+  std::vector<TypeParam> expected_verts(12);
+  int idx = 0;
+  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
+    TypeParam values[] = {
+      1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
+      1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
+    };
+    return values[idx++];
+  });
+
+  // Verify results over the inclusive block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts[vert_at(jv, jk, jb)], 
+                   static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+      }
+    }
+  }
+}
+
+TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsRandom) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Define indexing helpers
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in);
+  auto edge_vert_idx_h = Kokkos::create_mirror_view(this->edge_vert_idx);
+  auto edge_vert_blk_h = Kokkos::create_mirror_view(this->edge_vert_blk);
+  auto v_int_h = Kokkos::create_mirror_view(this->v_int);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1); // draws edge indices within a block
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1); // draws edge block numbers
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_edge_in_h[edge_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex connects to 6 edges
+      for (int j = 0; j < 6; ++j) {
+        edge_vert_idx_h[idx_at(ic, ib, j)] = edge_distrib(gen);
+        edge_vert_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+        
+        // Random interpolation coefficients
+        v_int_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 6.0; // Scaled to ensure reasonable sums
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_edge_in, p_edge_in_h);
+  Kokkos::deep_copy(this->edge_vert_idx, edge_vert_idx_h);
+  Kokkos::deep_copy(this->edge_vert_blk, edge_vert_blk_h);
+  Kokkos::deep_copy(this->v_int, v_int_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
+  // Call the function under test
+  edges2verts_scalar_lib<TypeParam>(
+      this->p_edge_in.data(), this->edge_vert_idx.data(),
+      this->edge_vert_blk.data(), this->v_int.data(),
+      this->p_vert_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_e, nblks_v, this->lacc);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Host-side 3D view for the reference values (indexed directly, not through vert_at)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v);
+
+  // Reference: for each vertex, sum coeff * p_edge_in over its 6 stencil edges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        expected_verts(jv, jk, jb) = 0.0;
+        
+        for (int j = 0; j < 6; ++j) {
+          int edge_idx = edge_vert_idx_h[idx_at(jv, jb, j)];
+          int edge_blk = edge_vert_blk_h[blk_at(jv, jb, j)];
+          TypeParam coeff = v_int_h[coeff_at(jv, j, jb)];
+          
+          expected_verts(jv, jk, jb) += coeff * p_edge_in_h[edge_at(edge_idx, jk, edge_blk)];
+        }
+      }
+    }
+  }
+
+  // Verify results over the inclusive block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts(jv, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
   }
@@ -229,55 +553,409 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Verts) {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarTypedTestFixture, Edges2Cells) {
+TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsSpecific) {  // edges2cells_scalar_lib: index-derived inputs checked against a hard-coded table
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_c = this->nblks_c;
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view; presumably first index fastest -- TODO confirm)
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 3, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in);
+  auto edge_idx_h = Kokkos::create_mirror_view(this->edge_idx);
+  auto edge_blk_h = Kokkos::create_mirror_view(this->edge_blk);
+  auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells);
+  auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out);
+
+  // Fill the edge field with index-derived values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // 1.0 + index + 0.1 * level + 0.01 * block, so each of the three indices contributes
+        p_edge_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
 
+  // Fill the cell-to-edge connectivity and the weights with a fixed pattern
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three stencil entries j per cell
+      for (int j = 0; j < 3; ++j) {
+        // Entry j refers to edge index (ic + j) mod nproma in block (ib + j mod 2) mod nblks_e
+        edge_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma;
+        edge_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_e;
+        
+        // Weight depends only on the stencil position j: 1/3 + 0.01 * j
+        coeff_int_cells_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 3.0 + j * 0.01);
+      }
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->p_edge_in, p_edge_in_h);
+  Kokkos::deep_copy(this->edge_idx, edge_idx_h);
+  Kokkos::deep_copy(this->edge_blk, edge_blk_h);
+  Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h);
+  Kokkos::deep_copy(this->p_cell_out, p_cell_out_h);
+
+  // Call the function under test on the raw view pointers, with the fixture's block/index/level bounds
+  edges2cells_scalar_lib<TypeParam>(
+      this->p_edge_in.data(), this->edge_idx.data(),
+      this->edge_blk.data(), this->coeff_int_cells.data(),
+      this->p_cell_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_e, nblks_c, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
+
+  // Reference values for the inputs above, looked up below by cell_at(); the lambda returns one table entry per call
+  std::vector<TypeParam> expected_cells(12);  // NOTE(review): 12 presumably equals nproma * nlev * nblks_c -- confirm
+  int idx = 0;
+  std::generate(expected_cells.begin(), expected_cells.end(), [&idx]() {
+    TypeParam values[] = {
+      1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261,
+      1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953
+    };
+    return values[idx++];
+  });
+
+  // Compare only [i_startblk, i_endblk] x [slev, elev] x [i_startidx, i_endidx] (all bounds inclusive)
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
+                    expected_cells[cell_at(jc, jk, jb)],
+                    static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
+      }
+    }
+  }
+}
+
+TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsRandom) {  // edges2cells_scalar_lib: seeded random inputs checked against a host-side recomputation
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_c = this->nblks_c;
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view)
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 3, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto p_edge_in_h = Kokkos::create_mirror_view(this->p_edge_in);
+  auto edge_idx_h = Kokkos::create_mirror_view(this->edge_idx);
+  auto edge_blk_h = Kokkos::create_mirror_view(this->edge_blk);
+  auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells);
+  auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out);
+
+  // Fixed seed (42) so the generator is seeded identically on every run
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1);  // valid edge index within a block
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1);  // valid edge block
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);  // field values and (before scaling) weights
+
+  // Fill the edge field with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_edge_in_h[edge_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Fill the cell-to-edge connectivity and the weights with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three stencil entries j per cell, each pointing at a random (index, block) pair
+      for (int j = 0; j < 3; ++j) {
+        edge_idx_h[idx_at(ic, ib, j)] = edge_distrib(gen);
+        edge_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+        
+        // Random weight for stencil entry j
+        coeff_int_cells_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 3.0; // Divided by 3 to keep the three-term sum moderate
+      }
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->p_edge_in, p_edge_in_h);
+  Kokkos::deep_copy(this->edge_idx, edge_idx_h);
+  Kokkos::deep_copy(this->edge_blk, edge_blk_h);
+  Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h);
+  Kokkos::deep_copy(this->p_cell_out, p_cell_out_h);
+
+  // Call the function under test on the raw view pointers, with the fixture's block/index/level bounds
   edges2cells_scalar_lib<TypeParam>(
-      this->p_edge_in.data(), this->edge_idx.data(), this->edge_blk.data(),
-      this->coeff_int_cells.data(), this->p_cell_out.data(), this->i_startblk,
-      this->i_endblk, this->i_startidx, this->i_endidx, this->slev, this->elev,
-      this->nproma, this->nlev, this->nblks_e, this->nblks_c, this->lacc);
-
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 3 stencil points,
-        // expect 3.
-        EXPECT_NEAR(this->p_cell_out[idx], static_cast<TypeParam>(3),
+      this->p_edge_in.data(), this->edge_idx.data(),
+      this->edge_blk.data(), this->coeff_int_cells.data(),
+      this->p_cell_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_e, nblks_c, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
+
+  // Host-only 3D view holding the reference result, indexed (cell, level, block)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_cells("expected_cells", nproma, nlev, nblks_c);
+
+  // Recompute the interpolation on the host for the same inclusive bounds that were passed to the library
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        expected_cells(jc, jk, jb) = 0.0;
+        
+        for (int j = 0; j < 3; ++j) {  // accumulate weight * edge value over the three stencil entries
+          int edge_index = edge_idx_h[idx_at(jc, jb, j)];
+          int edge_block = edge_blk_h[blk_at(jc, jb, j)];
+          TypeParam coeff = coeff_int_cells_h[coeff_at(jc, j, jb)];
+          
+          expected_cells(jc, jk, jb) += coeff * p_edge_in_h[edge_at(edge_index, jk, edge_block)];
+        }
+      }
+    }
+  }
+
+  // Compare library output and host reference over the same range, with absolute tolerance 1e-5
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
+                    expected_cells(jc, jk, jb), 
+                    static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ! verts2cells
+//
+////////////////////////////////////////////////////////////////////////////////
+
+TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsSpecific) {  // verts2cells_scalar_lib: index-derived inputs checked against a hard-coded table
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int npromz_c = this->npromz_c;  // number of cells checked in the last block (see nlen below)
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view; presumably first index fastest -- TODO confirm)
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 3, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto p_vert_in_h = Kokkos::create_mirror_view(this->p_vert_in);
+  auto cell_index_idx_h = Kokkos::create_mirror_view(this->cell_index_idx);
+  auto cell_index_blk_h = Kokkos::create_mirror_view(this->cell_index_blk);
+  auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells);
+  auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out);
+
+  // Fill the vertex field with index-derived values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // 1.0 + index + 0.1 * level + 0.01 * block, so each of the three indices contributes
+        p_vert_in_h[vert_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Fill the cell-to-vertex connectivity and the weights with a fixed pattern
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three stencil entries j per cell
+      for (int j = 0; j < 3; ++j) {
+        // Entry j refers to vertex index (ic + j) mod nproma in block (ib + j mod 2) mod nblks_v
+        cell_index_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma;
+        cell_index_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_v;
+        
+        // Weight depends only on the stencil position j: 1/3 + 0.01 * j
+        coeff_int_cells_h[coeff_at(ic, j, ib)] = static_cast<TypeParam>(1.0 / 3.0 + j * 0.01);
+      }
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->p_vert_in, p_vert_in_h);
+  Kokkos::deep_copy(this->cell_index_idx, cell_index_idx_h);
+  Kokkos::deep_copy(this->cell_index_blk, cell_index_blk_h);
+  Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h);
+  Kokkos::deep_copy(this->p_cell_out, p_cell_out_h);
+
+  // Call the function under test; it is given nblks_c and npromz_c but no start/end block or index bounds
+  verts2cells_scalar_lib<TypeParam>(
+      this->p_vert_in.data(), this->cell_index_idx.data(),
+      this->cell_index_blk.data(), this->coeff_int_cells.data(),
+      this->p_cell_out.data(), nblks_c, npromz_c, this->slev, this->elev,
+      nproma, nlev, nblks_v, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
+
+  // Reference values for the inputs above, looked up below by cell_at(); the lambda returns one table entry per call
+  std::vector<TypeParam> expected_cells(12);  // NOTE(review): 12 presumably equals nproma * nlev * nblks_c -- confirm
+  int idx = 0;
+  std::generate(expected_cells.begin(), expected_cells.end(), [&idx]() {
+    TypeParam values[] = {
+      1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261,
+      1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953
+    };
+    return values[idx++];
+  });
+
+  // Verify results for every block, every level in [slev, elev] and the first nlen cells of each block
+  for (int jb = 0; jb < nblks_c; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      // verts2cells_scalar_lib is not passed i_startblk/i_endblk/i_startidx/i_endidx, so those fixture
+      // bounds must not narrow the checked range; nlen mirrors the loop used in Verts2CellsRandom.
+      const int nlen = (jb != nblks_c - 1) ? nproma : npromz_c;
+      
+      for (int jc = 0; jc < nlen; ++jc) {
+        EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
+                    expected_cells[cell_at(jc, jk, jb)],
                     static_cast<TypeParam>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
   }
 }
 
-TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Cells) {
+TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsRandom) {  // verts2cells_scalar_lib: seeded random inputs checked against a host-side recomputation
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int npromz_c = this->npromz_c;  // number of cells used in the last block (see nlen below)
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view)
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 3, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto p_vert_in_h = Kokkos::create_mirror_view(this->p_vert_in);
+  auto cell_index_idx_h = Kokkos::create_mirror_view(this->cell_index_idx);
+  auto cell_index_blk_h = Kokkos::create_mirror_view(this->cell_index_blk);
+  auto coeff_int_cells_h = Kokkos::create_mirror_view(this->coeff_int_cells);
+  auto p_cell_out_h = Kokkos::create_mirror_view(this->p_cell_out);
+
+  // Fixed seed (42) so the generator is seeded identically on every run
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> vert_distrib(0, nproma - 1);  // valid vertex index within a block
+  std::uniform_int_distribution<int> block_distrib(0, nblks_v - 1);  // valid vertex block
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);  // field values and (before scaling) weights
+
+  // Fill the vertex field with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vert_in_h[vert_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Fill the cell-to-vertex connectivity and the weights with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three stencil entries j per cell, each pointing at a random (index, block) pair
+      for (int j = 0; j < 3; ++j) {
+        cell_index_idx_h[idx_at(ic, ib, j)] = vert_distrib(gen);
+        cell_index_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+        
+        // Random weight for stencil entry j
+        coeff_int_cells_h[coeff_at(ic, j, ib)] = real_distrib(gen) / 3.0; // Divided by 3 to keep the three-term sum moderate
+      }
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_cell_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->p_vert_in, p_vert_in_h);
+  Kokkos::deep_copy(this->cell_index_idx, cell_index_idx_h);
+  Kokkos::deep_copy(this->cell_index_blk, cell_index_blk_h);
+  Kokkos::deep_copy(this->coeff_int_cells, coeff_int_cells_h);
+  Kokkos::deep_copy(this->p_cell_out, p_cell_out_h);
 
+  // Call the function under test; it is given nblks_c and npromz_c but no start/end block or index bounds
   verts2cells_scalar_lib<TypeParam>(
       this->p_vert_in.data(), this->cell_index_idx.data(),
       this->cell_index_blk.data(), this->coeff_int_cells.data(),
-      this->p_cell_out.data(), this->nblks_c, this->npromz_c, this->slev,
-      this->elev, this->nproma, this->nlev, this->nblks_v, this->lacc);
-
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 3 stencil points,
-        // expect 3.
-        EXPECT_NEAR(this->p_cell_out[idx], static_cast<TypeParam>(3),
+      this->p_cell_out.data(), nblks_c, npromz_c, this->slev, this->elev,
+      nproma, nlev, nblks_v, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
+
+  // Host-only 3D view holding the reference result, indexed (cell, level, block)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_cells("expected_cells", nproma, nlev, nblks_c);
+
+  // Recompute the interpolation on the host for all blocks and levels [slev, elev]
+  for (int jb = 0; jb < nblks_c; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      int nlen = (jb != nblks_c - 1) ? nproma : npromz_c;  // every block uses nproma cells except the last, which uses npromz_c
+      for (int jc = 0; jc < nlen; ++jc) {
+        expected_cells(jc, jk, jb) = 0.0;
+        
+        for (int j = 0; j < 3; ++j) {  // accumulate weight * vertex value over the three stencil entries
+          int vert_index = cell_index_idx_h[idx_at(jc, jb, j)];
+          int vert_block = cell_index_blk_h[blk_at(jc, jb, j)];
+          TypeParam coeff = coeff_int_cells_h[coeff_at(jc, j, jb)];
+          
+          expected_cells(jc, jk, jb) += coeff * p_vert_in_h[vert_at(vert_index, jk, vert_block)];
+        }
+      }
+    }
+  }
+
+  // Compare library output and host reference over the same range, with absolute tolerance 1e-5
+  for (int jb = 0; jb < nblks_c; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      int nlen = (jb != nblks_c - 1) ? nproma : npromz_c;  // same per-block cell count as in the reference loop
+      for (int jc = 0; jc < nlen; ++jc) {
+        EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
+                    expected_cells(jc, jk, jb), 
                     static_cast<TypeParam>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
   }
@@ -289,48 +967,229 @@ TYPED_TEST(InterpolationScalarTypedTestFixture, Verts2Cells) {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarTypedTestFixture, AvgLib) {
+TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibSpecific) {  // cell_avg_lib: index-derived inputs checked against a hard-coded table
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view; presumably first index fastest -- TODO confirm)
+  const auto &psi_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 4, nblks_c>;  // 4 coefficients per cell: slot 0 for the cell itself, slots 1-3 for its neighbors
+  const auto &avg_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto psi_c_h = Kokkos::create_mirror_view(this->psi_c);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto avg_psi_c_h = Kokkos::create_mirror_view(this->avg_psi_c);
+
+  // Fill the cell field with index-derived values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // 1.0 + index + 0.1 * level + 0.01 * block, so each of the three indices contributes
+        psi_c_h[psi_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
 
-  // Call the function
-  cell_avg_lib<TypeParam>(this->psi_c.data(), this->cell_neighbor_idx.data(),
-                          this->cell_neighbor_blk.data(),
-                          this->avg_coeff.data(), this->avg_psi_c.data(),
-                          this->i_startblk, this->i_endblk, this->i_startidx,
-                          this->i_endidx, this->slev, this->elev, this->nproma,
-                          this->nlev, this->nblks_c, this->lacc);
-
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 4 stencil points,
-        // expect 4.
-        EXPECT_NEAR(this->avg_psi_c[idx], static_cast<TypeParam>(4),
+  // Fill the neighbor connectivity and the weights with a fixed pattern
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three neighbor entries j per cell
+      for (int j = 0; j < 3; ++j) {
+        // Entry j refers to cell index (ic + j + 1) mod nproma in block (ib + j mod 2) mod nblks_c
+        cell_neighbor_idx_h[idx_at(ic, ib, j)] = (ic + j + 1) % nproma;
+        cell_neighbor_blk_h[blk_at(ic, ib, j)] = (ib + j % 2) % nblks_c;
+      }
+      
+      // Averaging weights: 0.4 for the cell itself plus 0.2 per neighbor (they sum to 1.0)
+      avg_coeff_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(0.4);  // Self weight
+      avg_coeff_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(0.2);  // First neighbor
+      avg_coeff_h[coeff_at(ic, 2, ib)] = static_cast<TypeParam>(0.2);  // Second neighbor
+      avg_coeff_h[coeff_at(ic, 3, ib)] = static_cast<TypeParam>(0.2);  // Third neighbor
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        avg_psi_c_h[avg_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->psi_c, psi_c_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->avg_psi_c, avg_psi_c_h);
+
+  // Call the function under test on the raw view pointers, with the fixture's block/index/level bounds
+  cell_avg_lib<TypeParam>(
+      this->psi_c.data(), this->cell_neighbor_idx.data(),
+      this->cell_neighbor_blk.data(), this->avg_coeff.data(),
+      this->avg_psi_c.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_c, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(avg_psi_c_h, this->avg_psi_c);
+
+  // Reference values for the inputs above, looked up below by avg_at(); the lambda returns one table entry per call
+  std::vector<TypeParam> expected_avg(12);  // NOTE(review): 12 presumably equals nproma * nlev * nblks_c -- confirm
+  int idx = 0;
+  std::generate(expected_avg.begin(), expected_avg.end(), [&idx]() {
+    TypeParam values[] = {
+      1.402, 1.602, 1.502, 1.702, 1.602, 1.802,
+      1.408, 1.608, 1.508, 1.708, 1.608, 1.808
+    };
+    return values[idx++];
+  });
+
+  // Compare only [i_startblk, i_endblk] x [slev, elev] x [i_startidx, i_endidx] (all bounds inclusive)
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        EXPECT_NEAR(avg_psi_c_h[avg_at(jc, jk, jb)], 
+                    expected_avg[avg_at(jc, jk, jb)],
                     static_cast<TypeParam>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
+      }
+    }
+  }
+}
+
+TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibRandom) {  // cell_avg_lib: seeded random inputs checked against a host-side recomputation
+  constexpr int nproma = this->nproma;  // constexpr copies of the fixture sizes; used as template arguments of at<> below
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+
+  // Flat-index helpers, one per array shape (at<> is defined outside this view)
+  const auto &psi_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_c, 3>;
+  const auto &blk_at = at<nproma, nblks_c, 3>;
+  const auto &coeff_at = at<nproma, 4, nblks_c>;  // 4 coefficients per cell: slot 0 for the cell itself, slots 1-3 for its neighbors
+  const auto &avg_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views: filled below, then deep-copied into the fixture views
+  auto psi_c_h = Kokkos::create_mirror_view(this->psi_c);
+  auto cell_neighbor_idx_h = Kokkos::create_mirror_view(this->cell_neighbor_idx);
+  auto cell_neighbor_blk_h = Kokkos::create_mirror_view(this->cell_neighbor_blk);
+  auto avg_coeff_h = Kokkos::create_mirror_view(this->avg_coeff);
+  auto avg_psi_c_h = Kokkos::create_mirror_view(this->avg_psi_c);
+
+  // Fixed seed (42) so the generator is seeded identically on every run
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> cell_distrib(0, nproma - 1);  // valid cell index within a block
+  std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1);  // valid cell block
+  std::uniform_real_distribution<TypeParam> real_distrib(0.01, 1.0);  // field values
+  std::uniform_real_distribution<TypeParam> coeff_distrib(0.01, 0.5);  // weights, drawn from a narrower range than the field values
+
+  // Fill the cell field with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        psi_c_h[psi_at(ic, ik, ib)] = real_distrib(gen);
+      }
+    }
+  }
+
+  // Fill the neighbor connectivity and the weights with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Three neighbor entries j per cell, each pointing at a random (index, block) pair
+      for (int j = 0; j < 3; ++j) {
+        cell_neighbor_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen);
+        cell_neighbor_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+      }
+      
+      avg_coeff_h[coeff_at(ic, 0, ib)] = static_cast<TypeParam>(coeff_distrib(gen));  // self weight
+      avg_coeff_h[coeff_at(ic, 1, ib)] = static_cast<TypeParam>(coeff_distrib(gen));  // weights of neighbors 0..2 follow
+      avg_coeff_h[coeff_at(ic, 2, ib)] = static_cast<TypeParam>(coeff_distrib(gen));
+      avg_coeff_h[coeff_at(ic, 3, ib)] = static_cast<TypeParam>(coeff_distrib(gen));
+      
+      // Zero the output for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        avg_psi_c_h[avg_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host data (inputs and the zeroed output) into the fixture views
+  Kokkos::deep_copy(this->psi_c, psi_c_h);
+  Kokkos::deep_copy(this->cell_neighbor_idx, cell_neighbor_idx_h);
+  Kokkos::deep_copy(this->cell_neighbor_blk, cell_neighbor_blk_h);
+  Kokkos::deep_copy(this->avg_coeff, avg_coeff_h);
+  Kokkos::deep_copy(this->avg_psi_c, avg_psi_c_h);
+
+  // Call the function under test on the raw view pointers, with the fixture's block/index/level bounds
+  cell_avg_lib<TypeParam>(
+      this->psi_c.data(), this->cell_neighbor_idx.data(),
+      this->cell_neighbor_blk.data(), this->avg_coeff.data(),
+      this->avg_psi_c.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_c, this->lacc);
+
+  // Copy the result back into its host mirror
+  Kokkos::deep_copy(avg_psi_c_h, this->avg_psi_c);
+
+  // Host-only 3D view holding the reference result, indexed (cell, level, block)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_avg("expected_avg", nproma, nlev, nblks_c);
+
+  // Recompute the weighted average on the host for the same inclusive bounds that were passed to the library
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        // Start with the cell's own value times coefficient slot 0
+        expected_avg(jc, jk, jb) = 
+            psi_c_h[psi_at(jc, jk, jb)] * avg_coeff_h[coeff_at(jc, 0, jb)];
+        
+        // Add each neighbor's value on the same level times its weight
+        for (int j = 0; j < 3; ++j) {
+          int neighbor_idx = cell_neighbor_idx_h[idx_at(jc, jb, j)];
+          int neighbor_blk = cell_neighbor_blk_h[blk_at(jc, jb, j)];
+          TypeParam coeff = avg_coeff_h[coeff_at(jc, j+1, jb)];  // neighbor j uses coefficient slot j + 1
+          
+          expected_avg(jc, jk, jb) += 
+              psi_c_h[psi_at(neighbor_idx, jk, neighbor_blk)] * coeff;
+        }
+      }
+    }
+  }
+
+  // Compare library output and host reference over the same range, with absolute tolerance 1e-5
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
+        EXPECT_NEAR(avg_psi_c_h[avg_at(jc, jk, jb)], 
+                    expected_avg(jc, jk, jb), 
+                    static_cast<TypeParam>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
   }
 }
 
 template <typename TypePair>
-class InterpolationScalarMixedTestFixture : public ::testing::Test,
+class InterpolationScalarDoubleParamTest : public ::testing::Test,  // fixture for tests with separate input (InType) and output (OutType) element types
                                             public interp_dimensions {
-public:
+  protected:
   using InType = typename TypePair::in_type;
   using OutType = typename TypePair::out_type;
 
+  // Kokkos default execution space and its memory space; all views below live in memory_space
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
   // Arrays used for cells2edges
-  std::vector<InType> p_cell_in;        // Dimensions: (nproma, nlev, nblks_c)
-  std::vector<int> edge_cell_idx;       // Dimensions: (nproma, nblks_e, 2)
-  std::vector<int> edge_cell_blk;       // Dimensions: (nproma, nblks_e, 2)
-  std::vector<OutType> coeff_int_edges; // Dimensions: (nproma, 2, nblks_e)
-  std::vector<OutType> p_edge_out;      // Dimensions: (nproma, nlev, nblks_e)
+  Kokkos::View<InType*, memory_space> p_cell_in;  // flat, nproma * nlev * nblks_c elements
+  Kokkos::View<int*, memory_space> edge_cell_idx;  // flat, nproma * nblks_e * 2 elements
+  Kokkos::View<int*, memory_space> edge_cell_blk;  // flat, nproma * nblks_e * 2 elements
+  Kokkos::View<OutType*, memory_space> coeff_int_edges;  // flat, nproma * 2 * nblks_e elements
+  Kokkos::View<OutType*, memory_space> p_edge_out;  // flat, nproma * nlev * nblks_e elements
 
   // Further parameters for cells2edges
   const int patch_id = 0;
@@ -342,41 +1201,31 @@ public:
   std::vector<int> i_endidx_in;   // Dimensions: (2)
 
   // Arrays used for cells2verts
-  std::vector<int> vert_cell_idx;       // Dimensions: (nproma, nblks_v, 6)
-  std::vector<int> vert_cell_blk;       // Dimensions: (nproma, nblks_v, 6)
-  std::vector<OutType> coeff_int_verts; // Dimensions: (nproma, 6, nblks_v)
-  std::vector<OutType> p_vert_out;      // Dimensions: (nproma, nlev, nblks_v)
-
-  InterpolationScalarMixedTestFixture() {
-    // Allocate and initialize arrays needed for cells2edges
-    p_cell_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_c),
-                     static_cast<InType>(1));
-    edge_cell_idx.resize(num_elements_3d<int>(nproma, nblks_e, 2), 1);
-    edge_cell_blk.resize(num_elements_3d<int>(nproma, nblks_e, 2), 0);
-    coeff_int_edges.resize(num_elements_3d<InType>(nproma, 2, nblks_e),
-                           static_cast<OutType>(1));
-
-    p_edge_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_e),
-                      static_cast<OutType>(0));
-
+  Kokkos::View<int*, memory_space> vert_cell_idx;  // flat, nproma * nblks_v * 6 elements
+  Kokkos::View<int*, memory_space> vert_cell_blk;  // flat, nproma * nblks_v * 6 elements
+  Kokkos::View<OutType*, memory_space> coeff_int_verts;  // flat, nproma * 6 * nblks_v elements
+  Kokkos::View<OutType*, memory_space> p_vert_out;  // flat, nproma * nlev * nblks_v elements
+
+  InterpolationScalarDoubleParamTest()  // allocates every view as a labelled 1D array; this constructor writes no test data into them
+    : p_cell_in("p_cell_in", nproma * nlev * nblks_c),
+      edge_cell_idx("edge_cell_idx", nproma * nblks_e * 2),
+      edge_cell_blk("edge_cell_blk", nproma * nblks_e * 2),
+      coeff_int_edges("coeff_int_edges", nproma * 2 * nblks_e),
+      p_edge_out("p_edge_out", nproma * nlev * nblks_e),
+      vert_cell_idx("vert_cell_idx", nproma * nblks_v * 6),
+      vert_cell_blk("vert_cell_blk", nproma * nblks_v * 6),
+      coeff_int_verts("coeff_int_verts", nproma * 6 * nblks_v),
+      p_vert_out("p_vert_out", nproma * nlev * nblks_v)
+  {
     // Allocate neighbour indexes for cells2edges
     i_startblk_in.resize(2, i_startblk);
     i_endblk_in.resize(2, i_endblk);
     i_startidx_in.resize(2, i_startidx);
     i_endidx_in.resize(2, i_endidx);
-
-    // Allocate & Initialize arrays needed for cells2verts
-    vert_cell_idx.resize(num_elements_3d<int>(nproma, nblks_v, 6), 1);
-    vert_cell_blk.resize(num_elements_3d<int>(nproma, nblks_v, 6), 0);
-    coeff_int_verts.resize(num_elements_3d<InType>(nproma, 6, nblks_v),
-                           static_cast<OutType>(1));
-
-    p_vert_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v),
-                      static_cast<OutType>(0));
   }
 };
 
-TYPED_TEST_SUITE(InterpolationScalarMixedTestFixture, MixedTypesSP2DP);
+TYPED_TEST_SUITE(InterpolationScalarDoubleParamTest, MixedTypesSP2DP);
 
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -384,34 +1233,237 @@ TYPED_TEST_SUITE(InterpolationScalarMixedTestFixture, MixedTypesSP2DP);
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Edges) {
+TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesSpecific) {
   using InType = typename TestFixture::InType;
   using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_e = this->nblks_e;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &edge_idx_at = at<nproma, nblks_e, 2>;
+  const auto &edge_blk_at = at<nproma, nblks_e, 2>;
+  const auto &coeff_at = at<nproma, 2, nblks_e>;
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto edge_cell_idx_h = Kokkos::create_mirror_view(this->edge_cell_idx);
+  auto edge_cell_blk_h = Kokkos::create_mirror_view(this->edge_cell_blk);
+  auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges);
+  auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out);
+
+  // Initialize with index-based test values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
 
-  // Call the function
+  // Connectivity pattern: edge (ic, ib) uses cell ic of block ib and cell (ic+1)%nproma of block (ib+1)%nblks_c
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each edge connects to 2 cells (last index of the connectivity arrays: 0 or 1)
+      edge_cell_idx_h[edge_idx_at(ic, ib, 0)] = ic % nproma;                // First cell index
+      edge_cell_idx_h[edge_idx_at(ic, ib, 1)] = (ic + 1) % nproma;          // Second cell index
+      
+      edge_cell_blk_h[edge_blk_at(ic, ib, 0)] = ib % nblks_c;               // First cell block
+      edge_cell_blk_h[edge_blk_at(ic, ib, 1)] = (ib + 1) % nblks_c;         // Second cell block
+      
+      // Weights depend on ic only and sum to 1 per edge: 0.5 + ic*0.01 and 0.5 - ic*0.01
+      coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<OutType>(0.5 + ic * 0.01);
+      coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<OutType>(0.5 - ic * 0.01);
+      
+      // Initialize the output to zero for every level of this edge
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy the host mirrors into the fixture's views, whose data() pointers are passed to the library
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->edge_cell_idx, edge_cell_idx_h);
+  Kokkos::deep_copy(this->edge_cell_blk, edge_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h);
+  Kokkos::deep_copy(this->p_edge_out, p_edge_out_h);
+
+  // Call the function under test
+  cells2edges_scalar_lib<InType, OutType>(
+      this->p_cell_in.data(), this->edge_cell_idx.data(),
+      this->edge_cell_blk.data(), this->coeff_int_edges.data(),
+      this->p_edge_out.data(), this->i_startblk_in.data(),
+      this->i_endblk_in.data(), this->i_startidx_in.data(),
+      this->i_endidx_in.data(), this->slev, this->elev, nproma,
+      nlev, nblks_c, nblks_e, this->patch_id,
+      this->l_limited_area, this->lfill_latbc, this->lacc);
+
+  // Copy the results back into the host mirror so they can be checked
+  Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
+
+  int i_startblk = this->i_startblk_in[1];
+  int i_endblk = this->i_endblk_in[1];
+  int i_startidx_range = this->i_startidx_in[1];
+  int i_endidx_range = this->i_endidx_in[1];
+
+  // Hard-coded reference values for the inputs above, addressed with the same edge_at linear index as the output
+  std::vector<OutType> expected_edges(12);
+  int idx = 0;  // cursor into `values`, advanced once per generated element
+  std::generate(expected_edges.begin(), expected_edges.end(), [&idx]() {
+    OutType values[] = {
+      1.505, 1.5149, 1.605, 1.6149, 1.705, 1.7149,
+      1.505, 1.5151, 1.605, 1.6151, 1.705, 1.7151
+    };
+    return values[idx++];
+  });
+
+  // Verify results over blocks [i_startblk, i_endblk], levels [slev, elev] and the per-block index range
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb,
+                      i_startblk, i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        EXPECT_NEAR(p_edge_out_h[edge_at(je, jk, jb)], 
+                    expected_edges[edge_at(je, jk, jb)],
+                    static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << je;
+      }
+    }
+  }
+}
+
+TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesRandom) {
+  using InType = typename TestFixture::InType;
+  using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_e = this->nblks_e;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &edge_idx_at = at<nproma, nblks_e, 2>;
+  const auto &edge_blk_at = at<nproma, nblks_e, 2>;
+  const auto &coeff_at = at<nproma, 2, nblks_e>;
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto edge_cell_idx_h = Kokkos::create_mirror_view(this->edge_cell_idx);
+  auto edge_cell_blk_h = Kokkos::create_mirror_view(this->edge_cell_blk);
+  auto coeff_int_edges_h = Kokkos::create_mirror_view(this->coeff_int_edges);
+  auto p_edge_out_h = Kokkos::create_mirror_view(this->p_edge_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> cell_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize edge connectivity indices with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each edge connects to 2 cells
+      edge_cell_idx_h[edge_idx_at(ic, ib, 0)] = cell_distrib(gen);
+      edge_cell_idx_h[edge_idx_at(ic, ib, 1)] = cell_distrib(gen);
+      
+      edge_cell_blk_h[edge_blk_at(ic, ib, 0)] = block_distrib(gen);
+      edge_cell_blk_h[edge_blk_at(ic, ib, 1)] = block_distrib(gen);
+      
+      coeff_int_edges_h[coeff_at(ic, 0, ib)] = static_cast<OutType>(real_distrib(gen));
+      coeff_int_edges_h[coeff_at(ic, 1, ib)] = static_cast<OutType>(real_distrib(gen));
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_edge_out_h[edge_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->edge_cell_idx, edge_cell_idx_h);
+  Kokkos::deep_copy(this->edge_cell_blk, edge_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int_edges, coeff_int_edges_h);
+  Kokkos::deep_copy(this->p_edge_out, p_edge_out_h);
+
+  // Call the function under test
   cells2edges_scalar_lib<InType, OutType>(
       this->p_cell_in.data(), this->edge_cell_idx.data(),
       this->edge_cell_blk.data(), this->coeff_int_edges.data(),
       this->p_edge_out.data(), this->i_startblk_in.data(),
       this->i_endblk_in.data(), this->i_startidx_in.data(),
-      this->i_endidx_in.data(), this->slev, this->elev, this->nproma,
-      this->nlev, this->nblks_c, this->nblks_e, this->patch_id,
+      this->i_endidx_in.data(), this->slev, this->elev, nproma,
+      nlev, nblks_c, nblks_e, this->patch_id,
       this->l_limited_area, this->lfill_latbc, this->lacc);
 
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 2 stencil points,
-        // expect 2.
-        EXPECT_NEAR(this->p_edge_out[idx], static_cast<OutType>(2),
+  // Copy results back to host
+  Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
+
+  // Prepare expected results storage
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<OutType***, host_space> expected_edges("expected_edges", nproma, nlev, nblks_e);
+
+  // Since we're not testing the lateral boundary condition filling
+  // (this->l_limited_area == false && this->lfill_latbc == false),
+  // we only need to check the blocks in i_startblk_in[1] to i_endblk_in[1]
+  int i_startblk = this->i_startblk_in[1];
+  int i_endblk = this->i_endblk_in[1];
+  int i_startidx_range = this->i_startidx_in[1];
+  int i_endidx_range = this->i_endidx_in[1];
+
+  // Compute expected values
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
+    // Get the actual indices to process for this block
+    int i_startidx, i_endidx;
+    get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb,
+                      i_startblk, i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        expected_edges(je, jk, jb) = 
+            static_cast<OutType>(coeff_int_edges_h[coeff_at(je, 0, jb)] * 
+                                p_cell_in_h[cell_at(edge_cell_idx_h[edge_idx_at(je, jb, 0)], 
+                                                  jk, 
+                                                  edge_cell_blk_h[edge_blk_at(je, jb, 0)])]) +
+            static_cast<OutType>(coeff_int_edges_h[coeff_at(je, 1, jb)] * 
+                                p_cell_in_h[cell_at(edge_cell_idx_h[edge_idx_at(je, jb, 1)], 
+                                                  jk, 
+                                                  edge_cell_blk_h[edge_blk_at(je, jb, 1)])]);
+      }
+    }
+  }
+
+  // Verify results
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_e_lib(i_startidx_range, i_endidx_range, nproma, jb,
+                      i_startblk, i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        EXPECT_NEAR(p_edge_out_h[edge_at(je, jk, jb)], 
+                    expected_edges(je, jk, jb), 
                     static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+            << "Failure at block " << jb << ", level " << jk << ", index " << je;
       }
     }
   }
@@ -423,31 +1475,220 @@ TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Edges) {
 //
 ////////////////////////////////////////////////////////////////////////////////
 
-TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Verts) {
+TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsSpecific) {
   using InType = typename TestFixture::InType;
   using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx);
+  auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk);
+  auto coeff_int_verts_h = Kokkos::create_mirror_view(this->coeff_int_verts);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Initialize with index-based test values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
 
+  // Connectivity pattern: neighbour slot j of vertex (ic, ib) is cell (ic + j) % nproma, in a block that cycles with ib + j
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex is connected to 6 cells
+      for (int j = 0; j < 6; ++j) {
+        // `%` binds tighter than `+`: the block index is (ib + (j % nblks_c)) % nblks_c, i.e. (ib + j) % nblks_c
+        vert_cell_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma;
+        vert_cell_blk_h[blk_at(ic, ib, j)] = (ib + j % nblks_c) % nblks_c;
+        
+        // Weights depend on the neighbour slot j only: 1/6 + j * 0.01 (they do not sum to 1)
+        coeff_int_verts_h[coeff_at(ic, j, ib)] = static_cast<OutType>(1.0 / 6.0 + j * 0.01);
+      }
+      
+      // Initialize the output to zero for every level of this vertex
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy the host mirrors into the fixture's views, whose data() pointers are passed to the library
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h);
+  Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int_verts, coeff_int_verts_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
+  // Call the function under test
   cells2verts_scalar_lib<InType, OutType>(
       this->p_cell_in.data(), this->vert_cell_idx.data(),
       this->vert_cell_blk.data(), this->coeff_int_verts.data(),
       this->p_vert_out.data(), this->i_startblk, this->i_endblk,
-      this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma,
-      this->nlev, this->nblks_c, this->nblks_v, this->lacc, this->acc_async);
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Expected results based on the specific test values
+  std::vector<OutType> expected_verts(12);
+  int idx = 0;
+  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
+    OutType values[] = {
+      1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
+      1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
+    };
+    return values[idx++];
+  });
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts[vert_at(jv, jk, jb)],
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+      }
+    }
+  }
+}
 
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 6 stencil points,
-        // expect 6.
-        EXPECT_NEAR(this->p_vert_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsRandom) {
+  using InType = typename TestFixture::InType;
+  using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx);
+  auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk);
+  auto coeff_int_verts_h = Kokkos::create_mirror_view(this->coeff_int_verts);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> cell_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+  std::uniform_real_distribution<double> coeff_distrib(0.01, 0.3); // Keep coefficients reasonable
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex is connected to 6 cells
+      for (int j = 0; j < 6; ++j) {
+        vert_cell_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen);
+        vert_cell_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+        
+        // Normalized coefficients
+        coeff_int_verts_h[coeff_at(ic, j, ib)] = static_cast<OutType>(coeff_distrib(gen));
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h);
+  Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int_verts, coeff_int_verts_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
+  // Call the function under test
+  cells2verts_scalar_lib<InType, OutType>(
+      this->p_cell_in.data(), this->vert_cell_idx.data(),
+      this->vert_cell_blk.data(), this->coeff_int_verts.data(),
+      this->p_vert_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Prepare expected results storage
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<OutType***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v);
+
+  // Compute expected values
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    // Get the actual indices to process for this block
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        expected_verts(jv, jk, jb) = static_cast<OutType>(0.0);
+        
+        for (int j = 0; j < 6; ++j) {
+          int cell_idx = vert_cell_idx_h[idx_at(jv, jb, j)];
+          int cell_blk = vert_cell_blk_h[blk_at(jv, jb, j)];
+          OutType coeff = coeff_int_verts_h[coeff_at(jv, j, jb)];
+          
+          expected_verts(jv, jk, jb) += 
+              static_cast<OutType>(coeff * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]);
+        }
+      }
+    }
+  }
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts(jv, jk, jb), 
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
   }
@@ -460,7 +1701,7 @@ TYPED_TEST(InterpolationScalarMixedTestFixture, Cells2Verts) {
 ////////////////////////////////////////////////////////////////////////////////
 
 // The test for cells2verts_ri is similar to cells2verts, but is done here
-// separtely to avoid as a differebt template instantiation is needed for the
+// separately, as a different template instantiation is needed for the
 // function call
 template <typename Types>
 class Cells2vertsriScalarLibTestFixture : public testing::Test,
@@ -469,36 +1710,102 @@ public:
   using InType = typename Types::in_type;
   using OutType = typename Types::out_type;
 
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
   // Arrays stored in std::vector.
-  std::vector<InType> p_cell_in;   // Dimensions: (nproma, nlev, nblks_c)
-  std::vector<int> vert_cell_idx;  // Dimensions: (nproma, nblks_v, 6)
-  std::vector<int> vert_cell_blk;  // Dimensions: (nproma, nblks_v, 6)
-  std::vector<InType> coeff_int;   // Dimensions: (nproma, 6, nblks_v)
-  std::vector<OutType> p_vert_out; // Dimensions: (nproma, nlev, nblks_v)
-
-  Cells2vertsriScalarLibTestFixture() {
-    // Allocate and initialize inputs.
-    p_cell_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_c),
-                     static_cast<InType>(1));
-    vert_cell_idx.resize(num_elements_3d<int>(nproma, nblks_v, 6), 1);
-    vert_cell_blk.resize(num_elements_3d<int>(nproma, nblks_v, 6), 0);
-    coeff_int.resize(num_elements_3d<InType>(nproma, 6, nblks_v),
-                     static_cast<InType>(1));
-
-    // Allocate output arrays and initialize to zero.
-    p_vert_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v),
-                      static_cast<OutType>(0));
-  }
+  Kokkos::View<InType*, memory_space> p_cell_in;
+  Kokkos::View<int*, memory_space> vert_cell_idx;
+  Kokkos::View<int*, memory_space> vert_cell_blk;
+  Kokkos::View<InType*, memory_space> coeff_int;
+  Kokkos::View<OutType*, memory_space> p_vert_out;
+
+  Cells2vertsriScalarLibTestFixture()
+    : p_cell_in("p_cell_in", nproma * nlev * nblks_c),
+      vert_cell_idx("vert_cell_idx", nproma * nblks_v * 6),
+      vert_cell_blk("vert_cell_blk", nproma * nblks_v * 6),
+      coeff_int("coeff_int", nproma * 6 * nblks_v),
+      p_vert_out("p_vert_out", nproma * nlev * nblks_v)   
+  {}
 };
 
 // Add test suite
 TYPED_TEST_SUITE(Cells2vertsriScalarLibTestFixture, MixedTypes);
 
 // Add test
-TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRI) {
+TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRISpecific) {
   using InType = typename TestFixture::InType;
   using OutType = typename TestFixture::OutType;
 
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Linear-index helpers, one per array shape used by this test
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+
+  // The output layout depends on __LOOP_EXCHANGE. p_vert_out is a vertex
+  // array, so its helper is built with nblks_v (not nblks_c).
+#ifdef __LOOP_EXCHANGE
+  const auto &vert_at = at<nproma, nlev, nblks_v>;  // jv, jk, jb order
+#else
+  const auto &vert_at = at<nlev, nproma, nblks_v>;  // jk, jv, jb order
+#endif
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx);
+  auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk);
+  auto coeff_int_h = Kokkos::create_mirror_view(this->coeff_int);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Initialize with index-based test values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Connectivity pattern: neighbour slot j of vertex (ic, ib) is cell (ic + j) % nproma, in a block that cycles with ib + j
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex is connected to 6 cells
+      for (int j = 0; j < 6; ++j) {
+        // `%` binds tighter than `+`: the block index is (ib + (j % nblks_c)) % nblks_c, i.e. (ib + j) % nblks_c
+        vert_cell_idx_h[idx_at(ic, ib, j)] = (ic + j) % nproma;
+        vert_cell_blk_h[blk_at(ic, ib, j)] = (ib + j % nblks_c) % nblks_c;
+        
+        // coeff_int is an InType view in this fixture, so cast to InType (not OutType)
+        coeff_int_h[coeff_at(ic, j, ib)] = static_cast<InType>(1.0 / 6.0 + j * 0.01);
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        // The argument order of vert_at follows the layout selected above
+#ifdef __LOOP_EXCHANGE
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+#else
+        p_vert_out_h[vert_at(ik, ic, ib)] = static_cast<OutType>(0.0);
+#endif
+      }
+    }
+  }
+
+  // Copy the host mirrors into the fixture's views, whose data() pointers are passed to the library
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h);
+  Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int, coeff_int_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
   // Call the function
   cells2verts_scalar_ri_lib<InType, OutType>(
       this->p_cell_in.data(), this->vert_cell_idx.data(),
@@ -507,25 +1814,195 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRI) {
       this->i_startidx, this->i_endidx, this->slev, this->elev, this->nproma,
       this->nlev, this->nblks_c, this->nblks_v, this->lacc, this->acc_async);
 
-  // Check the outputs only for blocks in the range
-  // { [i_startblk, i_endblk], [slev,elev], [i_startidx_in, i_endidx_in] }
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = this->slev; level < this->elev; ++level) {
-      for (int i = this->i_startidx; i < this->i_endidx; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Expected results based on the specific test values
+  std::vector<OutType> expected_verts(12);
+  int idx = 0;
+  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
+    OutType values[] = {
 #ifdef __LOOP_EXCHANGE
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
+      1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
+      1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
 #else
-        size_t idx = level + i * this->nlev + block * this->nproma * this->nlev;
+      1.7459, 1.8609, 1.9759, 1.7159, 1.8309, 1.9459,
+      1.7456, 1.8606, 1.9756, 1.7156, 1.8306, 1.9456
+#endif
+    };
+    return values[idx++];
+  });
+
+  // Debug aid: dump the whole output buffer on one line, in storage order.
+  // Walking the buffer linearly does not depend on the (jv, jk) vs. (jk, jv)
+  // argument order that vert_at expects, so the dump stays in bounds whether
+  // or not __LOOP_EXCHANGE is defined.
+  std::cout << "p_vert_out_h: ";
+  const size_t n_out = p_vert_out_h.extent(0);
+  for (size_t i = 0; i < n_out; ++i) {
+    std::cout << p_vert_out_h[i] << ", ";
+  }
+  std::cout << std::endl;
+
+  // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+#ifdef __LOOP_EXCHANGE
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts[vert_at(jv, jk, jb)],
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+#else
+        EXPECT_NEAR(p_vert_out_h[vert_at(jk, jv, jb)], 
+                   expected_verts[vert_at(jk, jv, jb)],
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
 #endif
-        // Since every contribution is 1 and there are 6 stencil points,
-        // expect 6.
-        EXPECT_NEAR(this->p_vert_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
       }
     }
   }
 }
+
+TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRIRandom) {
+  using InType = typename TestFixture::InType;
+  using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_v = this->nblks_v;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<nproma, nblks_v, 6>;
+  const auto &blk_at = at<nproma, nblks_v, 6>;
+  const auto &coeff_at = at<nproma, 6, nblks_v>;
+  
+  // For output, we need to handle different layouts depending on __LOOP_EXCHANGE
+#ifdef __LOOP_EXCHANGE
+  const auto &vert_at = at<nproma, nlev, nblks_v>;  // jv, jk, jb order
+#else
+  const auto &vert_at = at<nlev, nproma, nblks_v>;  // jk, jv, jb order
+#endif
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto vert_cell_idx_h = Kokkos::create_mirror_view(this->vert_cell_idx);
+  auto vert_cell_blk_h = Kokkos::create_mirror_view(this->vert_cell_blk);
+  auto coeff_int_h = Kokkos::create_mirror_view(this->coeff_int);
+  auto p_vert_out_h = Kokkos::create_mirror_view(this->p_vert_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> cell_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+  std::uniform_real_distribution<double> coeff_distrib(0.01, 0.3); // Keep coefficients reasonable
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex is connected to 6 cells
+      for (int j = 0; j < 6; ++j) {
+        vert_cell_idx_h[idx_at(ic, ib, j)] = cell_distrib(gen);
+        vert_cell_blk_h[blk_at(ic, ib, j)] = block_distrib(gen);
+        
+        // Random positive weight from coeff_distrib (the six weights are not normalized)
+        coeff_int_h[coeff_at(ic, j, ib)] = static_cast<InType>(coeff_distrib(gen));
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        // Handle different indexing depending on __LOOP_EXCHANGE
+#ifdef __LOOP_EXCHANGE
+        p_vert_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+#else
+        p_vert_out_h[vert_at(ik, ic, ib)] = static_cast<OutType>(0.0);
+#endif
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->vert_cell_idx, vert_cell_idx_h);
+  Kokkos::deep_copy(this->vert_cell_blk, vert_cell_blk_h);
+  Kokkos::deep_copy(this->coeff_int, coeff_int_h);
+  Kokkos::deep_copy(this->p_vert_out, p_vert_out_h);
+
+  // Call the function
+  cells2verts_scalar_ri_lib<InType, OutType>(
+      this->p_cell_in.data(), this->vert_cell_idx.data(),
+      this->vert_cell_blk.data(), this->coeff_int.data(),
+      this->p_vert_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx, this->i_endidx, this->slev, this->elev,
+      nproma, nlev, nblks_c, nblks_v, this->lacc, this->acc_async);
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
+
+  // Prepare expected results storage
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<OutType***, host_space> expected_verts("expected_verts", nproma, nlev, nblks_v);
+
+  // Compute expected values
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    // Get the actual indices to process for this block
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        expected_verts(jv, jk, jb) = static_cast<OutType>(0.0);
+        
+        for (int j = 0; j < 6; ++j) {
+          int cell_idx = vert_cell_idx_h[idx_at(jv, jb, j)];
+          int cell_blk = vert_cell_blk_h[blk_at(jv, jb, j)];
+          InType coeff = coeff_int_h[coeff_at(jv, j, jb)];
+          
+          expected_verts(jv, jk, jb) += 
+              static_cast<OutType>(coeff * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)]);
+        }
+      }
+    }
+  }
+
+  // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx, this->i_endidx, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+#ifdef __LOOP_EXCHANGE
+        EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
+                   expected_verts(jv, jk, jb), 
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+#else
+        EXPECT_NEAR(p_vert_out_h[vert_at(jk, jv, jb)], 
+                   expected_verts(jv, jk, jb), 
+                   static_cast<OutType>(1e-5))
+            << "Failure at block " << jb << ", level " << jk << ", index " << jv;
+#endif
+      }
+    }
+  }
+}
+
-- 
GitLab


From e11d455517806132b15682d57d47623c3d074950 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 8 Apr 2025 18:28:45 +0200
Subject: [PATCH 19/34] fixed a few bugs in mo_lib_intp_rbf

---
 src/interpolation/mo_lib_intp_rbf.cpp | 82 +++++++++++++--------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/interpolation/mo_lib_intp_rbf.cpp b/src/interpolation/mo_lib_intp_rbf.cpp
index d1178a6..ce6e238 100644
--- a/src/interpolation/mo_lib_intp_rbf.cpp
+++ b/src/interpolation/mo_lib_intp_rbf.cpp
@@ -180,62 +180,62 @@ void rbf_interpol_c2grad_lib(const T *p_cell_in, const int *rbf_c2grad_idx,
         "rbf_interpol_c2grad", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
           grad_x_view(jc, jk, jb) =
-              rbf_c2grad_coeff_view(0, 1, jc, jb) * p_cell_in_view(jc, jk, jb) +
-              rbf_c2grad_coeff_view(1, 1, jc, jb) *
+              rbf_c2grad_coeff_view(0, 0, jc, jb) * p_cell_in_view(jc, jk, jb) +
+              rbf_c2grad_coeff_view(1, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(1, jc, jb), jk,
                                  rbf_c2grad_blk_view(1, jc, jb)) +
-              rbf_c2grad_coeff_view(2, 1, jc, jb) *
+              rbf_c2grad_coeff_view(2, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(2, jc, jb), jk,
                                  rbf_c2grad_blk_view(2, jc, jb)) +
-              rbf_c2grad_coeff_view(3, 1, jc, jb) *
+              rbf_c2grad_coeff_view(3, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(3, jc, jb), jk,
                                  rbf_c2grad_blk_view(3, jc, jb)) +
-              rbf_c2grad_coeff_view(4, 1, jc, jb) *
+              rbf_c2grad_coeff_view(4, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(4, jc, jb), jk,
                                  rbf_c2grad_blk_view(4, jc, jb)) +
-              rbf_c2grad_coeff_view(5, 1, jc, jb) *
+              rbf_c2grad_coeff_view(5, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(5, jc, jb), jk,
                                  rbf_c2grad_blk_view(5, jc, jb)) +
-              rbf_c2grad_coeff_view(6, 1, jc, jb) *
+              rbf_c2grad_coeff_view(6, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(6, jc, jb), jk,
                                  rbf_c2grad_blk_view(6, jc, jb)) +
-              rbf_c2grad_coeff_view(7, 1, jc, jb) *
+              rbf_c2grad_coeff_view(7, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(7, jc, jb), jk,
                                  rbf_c2grad_blk_view(7, jc, jb)) +
-              rbf_c2grad_coeff_view(8, 1, jc, jb) *
+              rbf_c2grad_coeff_view(8, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(8, jc, jb), jk,
                                  rbf_c2grad_blk_view(8, jc, jb)) +
-              rbf_c2grad_coeff_view(9, 1, jc, jb) *
+              rbf_c2grad_coeff_view(9, 0, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(9, jc, jb), jk,
                                  rbf_c2grad_blk_view(9, jc, jb));
 
           grad_y_view(jc, jk, jb) =
-              rbf_c2grad_coeff_view(0, 2, jc, jb) * p_cell_in_view(jc, jk, jb) +
-              rbf_c2grad_coeff_view(1, 2, jc, jb) *
+              rbf_c2grad_coeff_view(0, 1, jc, jb) * p_cell_in_view(jc, jk, jb) +
+              rbf_c2grad_coeff_view(1, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(1, jc, jb), jk,
                                  rbf_c2grad_blk_view(1, jc, jb)) +
-              rbf_c2grad_coeff_view(2, 2, jc, jb) *
+              rbf_c2grad_coeff_view(2, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(2, jc, jb), jk,
                                  rbf_c2grad_blk_view(2, jc, jb)) +
-              rbf_c2grad_coeff_view(3, 2, jc, jb) *
+              rbf_c2grad_coeff_view(3, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(3, jc, jb), jk,
                                  rbf_c2grad_blk_view(3, jc, jb)) +
-              rbf_c2grad_coeff_view(4, 2, jc, jb) *
+              rbf_c2grad_coeff_view(4, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(4, jc, jb), jk,
                                  rbf_c2grad_blk_view(4, jc, jb)) +
-              rbf_c2grad_coeff_view(5, 2, jc, jb) *
+              rbf_c2grad_coeff_view(5, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(5, jc, jb), jk,
                                  rbf_c2grad_blk_view(5, jc, jb)) +
-              rbf_c2grad_coeff_view(6, 2, jc, jb) *
+              rbf_c2grad_coeff_view(6, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(6, jc, jb), jk,
                                  rbf_c2grad_blk_view(6, jc, jb)) +
-              rbf_c2grad_coeff_view(7, 2, jc, jb) *
+              rbf_c2grad_coeff_view(7, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(7, jc, jb), jk,
                                  rbf_c2grad_blk_view(7, jc, jb)) +
-              rbf_c2grad_coeff_view(8, 2, jc, jb) *
+              rbf_c2grad_coeff_view(8, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(8, jc, jb), jk,
                                  rbf_c2grad_blk_view(8, jc, jb)) +
-              rbf_c2grad_coeff_view(9, 2, jc, jb) *
+              rbf_c2grad_coeff_view(9, 1, jc, jb) *
                   p_cell_in_view(rbf_c2grad_idx_view(9, jc, jb), jk,
                                  rbf_c2grad_blk_view(9, jc, jb));
         });
@@ -270,10 +270,10 @@ void rbf_vec_interpol_cell_lib(const T *p_vn_in, const int *rbf_vec_idx_c,
                                          nblks_c);
   UnmanagedConstInt3D rbf_vec_blk_c_view(rbf_vec_blk_c, rbf_vec_dim_c, nproma,
                                          nblks_c);
-  UnmanagedConstT4D rbf_vec_coeff_c_view(rbf_vec_coeff_c, nproma,
-                                         nblks_c); // TODO
+  UnmanagedConstT4D rbf_vec_coeff_c_view(rbf_vec_coeff_c, rbf_vec_dim_c, 2, nproma,
+                                         nblks_c);
   UnmanagedT3D p_u_out_view(p_u_out, nproma, nlev, nblks_c);
-  UnmanagedT3D p_v_out_view(p_u_out, nproma, nlev, nblks_c);
+  UnmanagedT3D p_v_out_view(p_v_out, nproma, nlev, nblks_c);
 
   for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
@@ -288,60 +288,60 @@ void rbf_vec_interpol_cell_lib(const T *p_vn_in, const int *rbf_vec_idx_c,
         "rbf_vec_interpol_cell_lib", innerPolicy,
         KOKKOS_LAMBDA(const int jk, const int jc) {
           p_u_out_view(jc, jk, jb) =
-              rbf_vec_coeff_c_view(0, 1, jc, jb) *
+              rbf_vec_coeff_c_view(0, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(0, jc, jb), jk,
                                rbf_vec_blk_c_view(0, jc, jb)) +
-              rbf_vec_coeff_c_view(1, 1, jc, jb) *
+              rbf_vec_coeff_c_view(1, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(1, jc, jb), jk,
                                rbf_vec_blk_c_view(1, jc, jb)) +
-              rbf_vec_coeff_c_view(2, 1, jc, jb) *
+              rbf_vec_coeff_c_view(2, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(2, jc, jb), jk,
                                rbf_vec_blk_c_view(2, jc, jb)) +
-              rbf_vec_coeff_c_view(3, 1, jc, jb) *
+              rbf_vec_coeff_c_view(3, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(3, jc, jb), jk,
                                rbf_vec_blk_c_view(3, jc, jb)) +
-              rbf_vec_coeff_c_view(4, 1, jc, jb) *
+              rbf_vec_coeff_c_view(4, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(4, jc, jb), jk,
                                rbf_vec_blk_c_view(4, jc, jb)) +
-              rbf_vec_coeff_c_view(5, 1, jc, jb) *
+              rbf_vec_coeff_c_view(5, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(5, jc, jb), jk,
                                rbf_vec_blk_c_view(5, jc, jb)) +
-              rbf_vec_coeff_c_view(6, 1, jc, jb) *
+              rbf_vec_coeff_c_view(6, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(6, jc, jb), jk,
                                rbf_vec_blk_c_view(6, jc, jb)) +
-              rbf_vec_coeff_c_view(7, 1, jc, jb) *
+              rbf_vec_coeff_c_view(7, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(7, jc, jb), jk,
                                rbf_vec_blk_c_view(7, jc, jb)) +
-              rbf_vec_coeff_c_view(8, 1, jc, jb) *
+              rbf_vec_coeff_c_view(8, 0, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(8, jc, jb), jk,
                                rbf_vec_blk_c_view(8, jc, jb));
 
           p_v_out_view(jc, jk, jb) =
-              rbf_vec_coeff_c_view(0, 2, jc, jb) *
+              rbf_vec_coeff_c_view(0, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(0, jc, jb), jk,
                                rbf_vec_blk_c_view(0, jc, jb)) +
-              rbf_vec_coeff_c_view(1, 2, jc, jb) *
+              rbf_vec_coeff_c_view(1, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(1, jc, jb), jk,
                                rbf_vec_blk_c_view(1, jc, jb)) +
-              rbf_vec_coeff_c_view(2, 2, jc, jb) *
+              rbf_vec_coeff_c_view(2, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(2, jc, jb), jk,
                                rbf_vec_blk_c_view(2, jc, jb)) +
-              rbf_vec_coeff_c_view(3, 2, jc, jb) *
+              rbf_vec_coeff_c_view(3, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(3, jc, jb), jk,
                                rbf_vec_blk_c_view(3, jc, jb)) +
-              rbf_vec_coeff_c_view(4, 2, jc, jb) *
+              rbf_vec_coeff_c_view(4, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(4, jc, jb), jk,
                                rbf_vec_blk_c_view(4, jc, jb)) +
-              rbf_vec_coeff_c_view(5, 2, jc, jb) *
+              rbf_vec_coeff_c_view(5, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(5, jc, jb), jk,
                                rbf_vec_blk_c_view(5, jc, jb)) +
-              rbf_vec_coeff_c_view(6, 2, jc, jb) *
+              rbf_vec_coeff_c_view(6, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(6, jc, jb), jk,
                                rbf_vec_blk_c_view(6, jc, jb)) +
-              rbf_vec_coeff_c_view(7, 2, jc, jb) *
+              rbf_vec_coeff_c_view(7, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(7, jc, jb), jk,
                                rbf_vec_blk_c_view(7, jc, jb)) +
-              rbf_vec_coeff_c_view(8, 2, jc, jb) *
+              rbf_vec_coeff_c_view(8, 1, jc, jb) *
                   p_vn_in_view(rbf_vec_idx_c_view(8, jc, jb), jk,
                                rbf_vec_blk_c_view(8, jc, jb));
         });
-- 
GitLab


From 677c30b83f322f46621c7907f93a14d10812df3b Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 8 Apr 2025 18:29:11 +0200
Subject: [PATCH 20/34] made an overhaul of test_intp_rbf

---
 test/c/CMakeLists.txt    |    2 +-
 test/c/test_intp_rbf.cpp | 1271 ++++++++++++++++++++++++++++++++------
 2 files changed, 1080 insertions(+), 193 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index e707970..175b226 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -34,7 +34,7 @@ set(SOURCES
   test_horizontal_rot.cpp
   # test_tdma_solver.cpp
   test_interpolation_vector.cpp
-  # test_intp_rbf.cpp
+  test_intp_rbf.cpp
   test_interpolation_scalar.cpp
 )
 # Create the test executable from your test files, including main.cpp.
diff --git a/test/c/test_intp_rbf.cpp b/test/c/test_intp_rbf.cpp
index 040d440..af72e65 100644
--- a/test/c/test_intp_rbf.cpp
+++ b/test/c/test_intp_rbf.cpp
@@ -15,15 +15,9 @@
 #include <gtest/gtest.h>
 #include <numeric>
 #include <vector>
-
-// Free-function helpers for 3D and 4D array sizes (assumed column-major)
-template <typename T> size_t num_elements_3d(int d1, int d2, int d3) {
-  return static_cast<size_t>(d1) * d2 * d3;
-}
-
-template <typename T> size_t num_elements_4d(int d1, int d2, int d3, int d4) {
-  return static_cast<size_t>(d1) * d2 * d3 * d4;
-}
+#include <random>
+#include <iostream>
+#include "dim_helper.hpp"
 
 // Define a helper struct that holds the two types.
 template <typename InT, typename OutT> struct MixedPrecision {
@@ -31,6 +25,9 @@ template <typename InT, typename OutT> struct MixedPrecision {
   using out_type = OutT;
 };
 
+// Define the list of types we want to test.
+typedef ::testing::Types<float, double> MyTypes;
+
 // Define the list of type pairs we want to test.
 typedef ::testing::Types<MixedPrecision<double, double>,
                          MixedPrecision<double, float>,
@@ -48,6 +45,7 @@ public:
   static constexpr int rbf_c2grad_dim = 10; // fixed dimension
   static constexpr int rbf_vec_dim_c = 9;
   static constexpr int rbf_vec_dim_e = 4;
+  static constexpr int rbf_vec_dim_v = 6;  // Fixed dimension for RBF
 
   // Parameter values.
   const int i_startblk = 0;
@@ -60,244 +58,1133 @@ public:
   const bool acc_async = false; // No asynchronous execution.
 };
 
-// Define a typed test fixture for the functions which have the same input and
-// output types
 template <typename T>
-class RbfInterpolTypedTestFixture : public ::testing::Test,
+class RbfInterpolSingleParamTest : public ::testing::Test,
                                     public interp_dimensions {
 public:
-  // Data arrays.
-  std::vector<T> p_cell_in;        // size: nproma * nlev * nblks_c
-  std::vector<int> rbf_c2grad_idx; // size: rbf_c2grad_dim * nproma * nblks_c
-  std::vector<int> rbf_c2grad_blk; // size: rbf_c2grad_dim * nproma * nblks_c
-  std::vector<int> rbf_vec_idx_c;  // size: rbf_vec_dim_c * nproma * nblks_c
-  std::vector<int> rbf_vec_blk_c;  // size: rbf_vec_dim_c * nproma * nblks_c
-  std::vector<T>
-      rbf_c2grad_coeff;  // size: rbf_c2grad_dim * 2 * nproma * nblks_c
-  std::vector<T> grad_x; // size: nproma * nlev * nblks_c
-  std::vector<T> grad_y; // size: nproma * nlev * nblks_c
-  std::vector<T> p_vn_in;
-  std::vector<T> rbf_vec_coeff_c;
-  std::vector<T> p_u_out;
-  std::vector<T> p_v_out;
-
-  std::vector<int> rbf_vec_idx_e;
-  std::vector<int> rbf_vec_blk_e;
-  std::vector<T> rbf_vec_coeff_e;
-  std::vector<T> p_vt_out;
-
-  RbfInterpolTypedTestFixture() {
-    size_t size3d = static_cast<size_t>(nproma) * nlev * nblks_c;
-    size_t size3d_idx = static_cast<size_t>(rbf_c2grad_dim) * nproma * nblks_c;
-    size_t size4d = static_cast<size_t>(rbf_c2grad_dim) * 2 * nproma * nblks_c;
-
-    size_t size3d_vec_dim =
-        static_cast<size_t>(rbf_vec_dim_c) * nproma * nblks_c;
-    size_t size_4d_vec_dim =
-        static_cast<size_t>(rbf_vec_dim_c) * 2 * nproma * nblks_c;
-
-    size_t size3d_edge_lib =
-        static_cast<size_t>(rbf_vec_dim_e) * nproma * nblks_c;
-    size_t size_4d_edge_lib =
-        static_cast<size_t>(rbf_vec_dim_e) * 2 * nproma * nblks_c;
-
-    p_cell_in.resize(size3d, static_cast<T>(1));
-    p_vn_in.resize(size3d, static_cast<T>(1));
-
-    rbf_vec_idx_c.resize(size3d_vec_dim, 1);
-    rbf_vec_blk_c.resize(size3d_vec_dim, 0);
-    rbf_c2grad_idx.resize(size3d_idx, 1);
-    rbf_c2grad_blk.resize(size3d_idx, 0); // Set block indices to 0 for testing.
-    rbf_vec_idx_e.resize(size3d_vec_dim, 1);
-    rbf_vec_blk_e.resize(size3d_vec_dim, 0);
-
-    rbf_vec_coeff_c.resize(size_4d_vec_dim, static_cast<T>(1));
-    rbf_c2grad_coeff.resize(size4d, static_cast<T>(1));
-    rbf_vec_coeff_e.resize(size_4d_edge_lib, static_cast<T>(1));
-
-    p_u_out.resize(size3d_vec_dim, static_cast<T>(0));
-    p_v_out.resize(size3d_vec_dim, static_cast<T>(0));
-    p_vt_out.resize(size3d_edge_lib, static_cast<T>(0));
-
-    grad_x.resize(size3d, static_cast<T>(0));
-    grad_y.resize(size3d, static_cast<T>(0));
-  }
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+  
+  // Kokkos Views for test data (flat 1-D storage; trailing comments give the logical dimensions)
+  Kokkos::View<T*, memory_space> p_cell_in;        // Dimensions: (nproma, nlev, nblks_c)
+  Kokkos::View<int*, memory_space> rbf_c2grad_idx; // Dimensions: (rbf_c2grad_dim, nproma, nblks_c)
+  Kokkos::View<int*, memory_space> rbf_c2grad_blk; // Dimensions: (rbf_c2grad_dim, nproma, nblks_c)
+  Kokkos::View<T*, memory_space> rbf_c2grad_coeff; // Dimensions: (rbf_c2grad_dim, 2, nproma, nblks_c)
+  Kokkos::View<T*, memory_space> grad_x;           // Dimensions: (nproma, nlev, nblks_c)
+  Kokkos::View<T*, memory_space> grad_y;           // Dimensions: (nproma, nlev, nblks_c)
+  
+  // Additional arrays for other functions
+  Kokkos::View<T*, memory_space> p_vn_in;
+  Kokkos::View<int*, memory_space> rbf_vec_idx_c;
+  Kokkos::View<int*, memory_space> rbf_vec_blk_c;
+  Kokkos::View<T*, memory_space> rbf_vec_coeff_c;
+  Kokkos::View<T*, memory_space> p_u_out;
+  Kokkos::View<T*, memory_space> p_v_out;
+  
+  Kokkos::View<int*, memory_space> rbf_vec_idx_e;
+  Kokkos::View<int*, memory_space> rbf_vec_blk_e;
+  Kokkos::View<T*, memory_space> rbf_vec_coeff_e;
+  Kokkos::View<T*, memory_space> p_vt_out;
+
+  RbfInterpolSingleParamTest()
+      : p_cell_in("p_cell_in", nproma * nlev * nblks_c),
+        rbf_c2grad_idx("rbf_c2grad_idx", rbf_c2grad_dim * nproma * nblks_c),
+        rbf_c2grad_blk("rbf_c2grad_blk", rbf_c2grad_dim * nproma * nblks_c),
+        rbf_c2grad_coeff("rbf_c2grad_coeff", rbf_c2grad_dim * 2 * nproma * nblks_c),
+        grad_x("grad_x", nproma * nlev * nblks_c),
+        grad_y("grad_y", nproma * nlev * nblks_c),
+        
+        p_vn_in("p_vn_in", nproma * nlev * nblks_c),
+        rbf_vec_idx_c("rbf_vec_idx_c", rbf_vec_dim_c * nproma * nblks_c),
+        rbf_vec_blk_c("rbf_vec_blk_c", rbf_vec_dim_c * nproma * nblks_c),
+        rbf_vec_coeff_c("rbf_vec_coeff_c", rbf_vec_dim_c * 2 * nproma * nblks_c),
+        p_u_out("p_u_out", nproma * nlev * nblks_c),
+        p_v_out("p_v_out", nproma * nlev * nblks_c),
+        
+        rbf_vec_idx_e("rbf_vec_idx_e", rbf_vec_dim_e * nproma * nblks_c),
+        rbf_vec_blk_e("rbf_vec_blk_e", rbf_vec_dim_e * nproma * nblks_c),
+        rbf_vec_coeff_e("rbf_vec_coeff_e", rbf_vec_dim_e * 2 * nproma * nblks_c),
+        p_vt_out("p_vt_out", nproma * nlev * nblks_c)
+  {}
 };
 
-typedef ::testing::Types<float, double> MyTypes;
+TYPED_TEST_SUITE(RbfInterpolSingleParamTest, MyTypes);
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ! rbf_interpol_c2grad
+//
+////////////////////////////////////////////////////////////////////////////////
+
+TYPED_TEST(RbfInterpolSingleParamTest, C2GradSpecific) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int rbf_c2grad_dim = this->rbf_c2grad_dim;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<rbf_c2grad_dim, nproma, nblks_c>;
+  const auto &blk_at = at<rbf_c2grad_dim, nproma, nblks_c>;
+  const auto &coeff_at = at<rbf_c2grad_dim, 2, nproma, nblks_c>;
+  const auto &grad_at = at<nproma, nlev, nblks_c>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto rbf_c2grad_idx_h = Kokkos::create_mirror_view(this->rbf_c2grad_idx);
+  auto rbf_c2grad_blk_h = Kokkos::create_mirror_view(this->rbf_c2grad_blk);
+  auto rbf_c2grad_coeff_h = Kokkos::create_mirror_view(this->rbf_c2grad_coeff);
+  auto grad_x_h = Kokkos::create_mirror_view(this->grad_x);
+  auto grad_y_h = Kokkos::create_mirror_view(this->grad_y);
 
-TYPED_TEST_SUITE(RbfInterpolTypedTestFixture, MyTypes);
+  // Initialize with index-based pattern for cell data
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Initialize connectivity indices with specific pattern
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // First index points to self
+      rbf_c2grad_idx_h[idx_at(0, ic, ib)] = ic;
+      rbf_c2grad_blk_h[blk_at(0, ic, ib)] = ib;
+      
+      // Other indices follow a pattern
+      for (int j = 1; j < rbf_c2grad_dim; ++j) {
+        rbf_c2grad_idx_h[idx_at(j, ic, ib)] = (ic + j) % nproma;
+        rbf_c2grad_blk_h[blk_at(j, ic, ib)] = (ib + j % nblks_c) % nblks_c;
+      }
+      
+      // Coefficients for x and y gradients - use a simple pattern that depends on ib, ic and j
+      for (int j = 0; j < rbf_c2grad_dim; ++j) {
+        rbf_c2grad_coeff_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient
+        rbf_c2grad_coeff_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient
+      }
+    }
+  }
 
-TYPED_TEST(RbfInterpolTypedTestFixture, C2Grad) {
-  using T = TypeParam;
+  // Initialize gradients to zero
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        grad_x_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        grad_y_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->rbf_c2grad_idx, rbf_c2grad_idx_h);
+  Kokkos::deep_copy(this->rbf_c2grad_blk, rbf_c2grad_blk_h);
+  Kokkos::deep_copy(this->rbf_c2grad_coeff, rbf_c2grad_coeff_h);
+  Kokkos::deep_copy(this->grad_x, grad_x_h);
+  Kokkos::deep_copy(this->grad_y, grad_y_h);
+
+  Kokkos::fence();
+
+  // Call the function
   rbf_interpol_c2grad_lib<TypeParam>(
       this->p_cell_in.data(), this->rbf_c2grad_idx.data(),
       this->rbf_c2grad_blk.data(), this->rbf_c2grad_coeff.data(),
       this->grad_x.data(), this->grad_y.data(), this->i_startblk,
       this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
-      this->elev, this->nproma, this->rbf_c2grad_dim, this->nlev, this->nblks_c,
-      this->lacc);
-
-  // For each block from i_startblk to i_endblk-1, and for each (i, level)
-  // the kernel sums rbf_c2grad_dim contributions, each equal to 1.
-  // Therefore, we expect grad_x and grad_y to equal rbf_c2grad_dim (i.e., 10).
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
-    for (int jk = 0; jk < this->nlev; ++jk) {
-      for (int i = 0; i < this->nproma; ++i) {
-        size_t idx = i + static_cast<size_t>(jk) * this->nproma +
-                     static_cast<size_t>(jb) * this->nproma * this->nlev;
-        EXPECT_NEAR(this->grad_x[idx],
-                    static_cast<TypeParam>(this->rbf_c2grad_dim),
-                    static_cast<TypeParam>(1e-5))
-            << "grad_x failure at block " << jb << ", level " << jk
-            << ", index " << i;
-        EXPECT_NEAR(this->grad_y[idx],
-                    static_cast<TypeParam>(this->rbf_c2grad_dim),
-                    static_cast<TypeParam>(1e-5))
-            << "grad_y failure at block " << jb << ", level " << jk
-            << ", index " << i;
+      this->elev, nproma, rbf_c2grad_dim, nlev, nblks_c, this->lacc);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(grad_x_h, this->grad_x);
+  Kokkos::deep_copy(grad_y_h, this->grad_y);
+
+  // Expected results based on the specific test values
+  std::vector<TypeParam> expected_grad_x(24);
+  std::vector<TypeParam> expected_grad_y(24);
+  int idx = 0;
+  std::generate(expected_grad_x.begin(), expected_grad_x.end(), [&idx]() {
+    TypeParam values[] = {
+      19.9225, 22.9275, 26.2225, 20.9675, 24.0725, 27.4675,
+      22.0125, 25.2175, 28.7125, 23.0575, 26.3625, 29.9575,
+      38.972, 42.977, 47.272, 41.017, 45.122, 49.517,
+      43.062, 47.267, 51.762, 45.107, 49.412, 54.007
+    };
+    return values[idx++];
+  });
+
+  idx = 0;
+  std::generate(expected_grad_y.begin(), expected_grad_y.end(), [&idx]() {
+    TypeParam values[] = {
+      38.9725, 42.9775, 47.2725, 41.0175, 45.1225, 49.5175,
+      43.0625, 47.2675, 51.7625, 45.1075, 49.4125, 54.0075,
+      58.022, 63.027, 68.322, 61.067, 66.172, 71.567,
+      64.112, 69.317, 74.812, 67.157, 72.462, 78.057 
+    };
+    return values[idx++];
+  });
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        EXPECT_NEAR(grad_x_h[grad_at(jc, jk, jb)], 
+                   expected_grad_x[grad_at(jc, jk, jb)],
+                   static_cast<TypeParam>(1e-5))
+            << "grad_x failure at block " << jb << ", level " << jk << ", index " << jc;
+        EXPECT_NEAR(grad_y_h[grad_at(jc, jk, jb)], 
+                   expected_grad_y[grad_at(jc, jk, jb)], 
+                   static_cast<TypeParam>(1e-5))
+            << "grad_y failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
   }
 }
 
-TYPED_TEST(RbfInterpolTypedTestFixture, Cell) {
-  using T = TypeParam;
+TYPED_TEST(RbfInterpolSingleParamTest, C2GradRandom) {
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int rbf_c2grad_dim = this->rbf_c2grad_dim;
+
+  // Define indexing helpers
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+  const auto &idx_at = at<rbf_c2grad_dim, nproma, nblks_c>;
+  const auto &blk_at = at<rbf_c2grad_dim, nproma, nblks_c>;
+  const auto &coeff_at = at<rbf_c2grad_dim, 2, nproma, nblks_c>;
+  const auto &grad_at = at<nproma, nlev, nblks_c>;
+
+  // Create host mirror views
+  auto p_cell_in_h = Kokkos::create_mirror_view(this->p_cell_in);
+  auto rbf_c2grad_idx_h = Kokkos::create_mirror_view(this->rbf_c2grad_idx);
+  auto rbf_c2grad_blk_h = Kokkos::create_mirror_view(this->rbf_c2grad_blk);
+  auto rbf_c2grad_coeff_h = Kokkos::create_mirror_view(this->rbf_c2grad_coeff);
+  auto grad_x_h = Kokkos::create_mirror_view(this->grad_x);
+  auto grad_y_h = Kokkos::create_mirror_view(this->grad_y);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> cell_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_c - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+  std::uniform_real_distribution<double> coeff_distrib(-0.2, 0.2);  // Allow negative coefficients for gradients
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_cell_in_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize connectivity indices with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // First index points to self
+      rbf_c2grad_idx_h[idx_at(0, ic, ib)] = ic;
+      rbf_c2grad_blk_h[blk_at(0, ic, ib)] = ib;
+      
+      // Other indices randomized
+      for (int j = 1; j < rbf_c2grad_dim; ++j) {
+        rbf_c2grad_idx_h[idx_at(j, ic, ib)] = cell_distrib(gen);
+        rbf_c2grad_blk_h[blk_at(j, ic, ib)] = block_distrib(gen);
+      }
+      
+      // Random coefficients for gradient reconstruction
+      for (int j = 0; j < rbf_c2grad_dim; ++j) {
+        rbf_c2grad_coeff_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen));  // x coefficient
+        rbf_c2grad_coeff_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen));  // y coefficient
+      }
+    }
+  }
+
+  // Initialize gradients to zero
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        grad_x_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        grad_y_h[grad_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_cell_in, p_cell_in_h);
+  Kokkos::deep_copy(this->rbf_c2grad_idx, rbf_c2grad_idx_h);
+  Kokkos::deep_copy(this->rbf_c2grad_blk, rbf_c2grad_blk_h);
+  Kokkos::deep_copy(this->rbf_c2grad_coeff, rbf_c2grad_coeff_h);
+  Kokkos::deep_copy(this->grad_x, grad_x_h);
+  Kokkos::deep_copy(this->grad_y, grad_y_h);
+
+  Kokkos::fence();
+
+  // Call the function
+  rbf_interpol_c2grad_lib<TypeParam>(
+      this->p_cell_in.data(), this->rbf_c2grad_idx.data(),
+      this->rbf_c2grad_blk.data(), this->rbf_c2grad_coeff.data(),
+      this->grad_x.data(), this->grad_y.data(), this->i_startblk,
+      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
+      this->elev, nproma, rbf_c2grad_dim, nlev, nblks_c, this->lacc);
+
+  Kokkos::fence();
 
-  rbf_vec_interpol_cell_lib<T>(
+  // Copy results back to host
+  Kokkos::deep_copy(grad_x_h, this->grad_x);
+  Kokkos::deep_copy(grad_y_h, this->grad_y);
+
+  // Calculate expected values
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_grad_x("expected_grad_x", nproma, nlev, nblks_c);
+  Kokkos::View<TypeParam***, host_space> expected_grad_y("expected_grad_y", nproma, nlev, nblks_c);
+
+  // Compute expected values
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        expected_grad_x(jc, jk, jb) = static_cast<TypeParam>(0.0);
+        expected_grad_y(jc, jk, jb) = static_cast<TypeParam>(0.0);
+        
+        for (int j = 0; j < rbf_c2grad_dim; ++j) {
+          int cell_idx = rbf_c2grad_idx_h[idx_at(j, jc, jb)];
+          int cell_blk = rbf_c2grad_blk_h[blk_at(j, jc, jb)];
+          TypeParam coeff_x = rbf_c2grad_coeff_h[coeff_at(j, 0, jc, jb)];
+          TypeParam coeff_y = rbf_c2grad_coeff_h[coeff_at(j, 1, jc, jb)];
+          
+          expected_grad_x(jc, jk, jb) += 
+              coeff_x * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)];
+          expected_grad_y(jc, jk, jb) += 
+              coeff_y * p_cell_in_h[cell_at(cell_idx, jk, cell_blk)];
+        }
+      }
+    }
+  }
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        EXPECT_NEAR(grad_x_h[grad_at(jc, jk, jb)], 
+                   expected_grad_x(jc, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "grad_x failure at block " << jb << ", level " << jk << ", index " << jc;
+        EXPECT_NEAR(grad_y_h[grad_at(jc, jk, jb)], 
+                   expected_grad_y(jc, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "grad_y failure at block " << jb << ", level " << jk << ", index " << jc;
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ! rbf_vec_interpol_cell
+//
+////////////////////////////////////////////////////////////////////////////////
+
+TYPED_TEST(RbfInterpolSingleParamTest, CellSpecific) {  // rbf_vec_interpol_cell_lib on a deterministic index pattern, checked against hard-coded values
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int rbf_vec_dim_c = this->rbf_vec_dim_c;
+
+  // Indexing helpers: each at<...> instantiation turns a multi-index into an offset into the matching flat view
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_c, nproma, nblks_c>;
+  const auto &blk_at = at<rbf_vec_dim_c, nproma, nblks_c>;
+  const auto &coeff_at = at<rbf_vec_dim_c, 2, nproma, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views, so inputs can be filled and outputs inspected on the host
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto rbf_vec_idx_c_h = Kokkos::create_mirror_view(this->rbf_vec_idx_c);
+  auto rbf_vec_blk_c_h = Kokkos::create_mirror_view(this->rbf_vec_blk_c);
+  auto rbf_vec_coeff_c_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_c);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Fill the edge input field with a deterministic, index-based pattern
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices, so a wrong index/level/block lookup changes the result
+        p_vn_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Fill the cell -> edge stencil (indices, blocks, coefficients) with a deterministic pattern
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each cell gets rbf_vec_dim_c stencil entries
+      for (int j = 0; j < rbf_vec_dim_c; ++j) {
+        // Stencil index/block wrap around via modulo, so they stay inside [0, nproma) and [0, nblks_e)
+        rbf_vec_idx_c_h[idx_at(j, ic, ib)] = (ic + j) % nproma;
+        rbf_vec_blk_c_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e;
+        
+        // Interpolation coefficients depend on block, cell index and stencil position
+        rbf_vec_coeff_c_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient
+        rbf_vec_coeff_c_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient
+      }
+      
+      // Zero both outputs for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        p_v_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host-initialised data into the fixture views that are handed to the library
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_c, rbf_vec_idx_c_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_c, rbf_vec_blk_c_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_c, rbf_vec_coeff_c_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Run the routine under test on the raw pointers of the fixture views
+  rbf_vec_interpol_cell_lib<TypeParam>(
       this->p_vn_in.data(), this->rbf_vec_idx_c.data(),
       this->rbf_vec_blk_c.data(), this->rbf_vec_coeff_c.data(),
       this->p_u_out.data(), this->p_v_out.data(), this->i_startblk,
       this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
-      this->elev, this->nproma, this->rbf_c2grad_dim, this->nlev, this->nblks_c,
-      this->nblks_e, this->lacc, this->acc_async);
-
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
-    for (int jk = 0; jk < this->nlev; ++jk) {
-      for (int i = 0; i < this->nproma; ++i) {
-        size_t idx = i + static_cast<size_t>(jk) * this->nproma +
-                     static_cast<size_t>(jb) * this->nproma * this->nlev;
-        EXPECT_NEAR(this->p_u_out[idx], static_cast<T>(this->rbf_vec_dim_c),
-                    static_cast<T>(1e-5))
-            << "p_u_out failure at block " << jb << ", level " << jk
-            << ", index " << i;
+      this->elev, nproma, nlev, nblks_c, nblks_e, rbf_vec_dim_c,
+      this->lacc, this->acc_async);
+
+  Kokkos::fence();
+
+  // Bring the computed u/v fields back into the host mirrors for checking
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Hard-coded reference values, indexed below with cell_at. NOTE(review): 24 presumably equals nproma * nlev * nblks_c - confirm against the fixture
+  std::vector<TypeParam> expected_u(24);
+  std::vector<TypeParam> expected_v(24);
+  int idx = 0;
+  std::generate(expected_u.begin(), expected_u.end(), [&idx]() {
+    TypeParam values[] = {
+      18.8216, 20.5356, 22.3396, 19.7576, 21.5616, 23.4556,
+      20.6936, 22.5876, 24.5716, 21.6296, 23.6136, 25.6876,
+      36.882, 38.597, 40.402, 38.718, 40.523, 42.418,
+      40.554, 42.449, 44.434, 42.39, 44.375, 46.45
+    };
+    return values[idx++];
+  });
+  idx = 0;
+  std::generate(expected_v.begin(), expected_v.end(), [&idx]() {
+    TypeParam values[] = {
+      36.8616, 38.5756, 40.3796, 38.6976, 40.5016, 42.3956,
+      40.5336, 42.4276, 44.4116, 42.3696, 44.3536, 46.4276,
+      54.932, 56.647, 58.452, 57.668, 59.473, 61.368,
+      60.404, 62.299, 64.284, 63.14, 65.125, 67.2
+    };
+    return values[idx++];
+  });
+
+  // Compare against the reference for the requested blocks/levels; per-block index bounds come from get_indices_c_lib
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        EXPECT_NEAR(p_u_out_h[cell_at(jc, jk, jb)], 
+                   expected_u[cell_at(jc, jk, jb)],
+                   static_cast<TypeParam>(1e-5))
+            << "u failure at block " << jb << ", level " << jk << ", index " << jc;
+        
+        EXPECT_NEAR(p_v_out_h[cell_at(jc, jk, jb)], 
+                   expected_v[cell_at(jc, jk, jb)],
+                   static_cast<TypeParam>(1e-5))
+            << "v failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
   }
 }
 
-TYPED_TEST(RbfInterpolTypedTestFixture, Edge) {
-  using T = TypeParam;
+TYPED_TEST(RbfInterpolSingleParamTest, CellRandom) {  // rbf_vec_interpol_cell_lib on seeded random data, checked against a host-side reference sum
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_c = this->nblks_c;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int rbf_vec_dim_c = this->rbf_vec_dim_c;
+
+  // Indexing helpers: each at<...> instantiation turns a multi-index into an offset into the matching flat view
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_c, nproma, nblks_c>;
+  const auto &blk_at = at<rbf_vec_dim_c, nproma, nblks_c>;
+  const auto &coeff_at = at<rbf_vec_dim_c, 2, nproma, nblks_c>;
+  const auto &cell_at = at<nproma, nlev, nblks_c>;
+
+  // Host mirrors of the fixture views, so inputs can be filled and outputs inspected on the host
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto rbf_vec_idx_c_h = Kokkos::create_mirror_view(this->rbf_vec_idx_c);
+  auto rbf_vec_blk_c_h = Kokkos::create_mirror_view(this->rbf_vec_blk_c);
+  auto rbf_vec_coeff_c_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_c);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Fixed seed keeps the generated inputs identical from run to run; index ranges are inclusive and stay in bounds
+  std::mt19937 gen(43);  // Different seed from other tests
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+  std::uniform_real_distribution<double> coeff_distrib(0.01, 0.2);
+
+  // Fill the edge input field with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vn_in_h[edge_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Fill the cell -> edge stencil (indices, blocks, coefficients) with random values
+  for (int ib = 0; ib < nblks_c; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      for (int j = 0; j < rbf_vec_dim_c; ++j) {
+        // Random edge index and edge block for stencil entry j
+        rbf_vec_idx_c_h[idx_at(j, ic, ib)] = edge_distrib(gen);
+        rbf_vec_blk_c_h[blk_at(j, ic, ib)] = block_distrib(gen);
+        // Random coefficients: component 0 feeds the u output, component 1 the v output (see the reference loop below)
+        rbf_vec_coeff_c_h[coeff_at(j, 0, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen));
+        rbf_vec_coeff_c_h[coeff_at(j, 1, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen));
+      }
+      
+      // Zero both outputs for every level of this cell
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+        p_v_out_h[cell_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host-initialised data into the fixture views that are handed to the library
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_c, rbf_vec_idx_c_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_c, rbf_vec_blk_c_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_c, rbf_vec_coeff_c_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Run the routine under test on the raw pointers of the fixture views
+  rbf_vec_interpol_cell_lib<TypeParam>(
+      this->p_vn_in.data(), this->rbf_vec_idx_c.data(),
+      this->rbf_vec_blk_c.data(), this->rbf_vec_coeff_c.data(),
+      this->p_u_out.data(), this->p_v_out.data(), this->i_startblk,
+      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
+      this->elev, nproma, nlev, nblks_c, nblks_e, rbf_vec_dim_c,
+      this->lacc, this->acc_async);
+
+  Kokkos::fence();
+
+  // Bring the computed u/v fields back into the host mirrors for checking
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Host-side storage for the reference result, shaped (nproma, nlev, nblks_c)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_u("expected_u", nproma, nlev, nblks_c);
+  Kokkos::View<TypeParam***, host_space> expected_v("expected_v", nproma, nlev, nblks_c);
+
+  // Reference: for every requested point, sum coeff * p_vn_in over the rbf_vec_dim_c stencil entries
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        expected_u(jc, jk, jb) = static_cast<TypeParam>(0.0);
+        expected_v(jc, jk, jb) = static_cast<TypeParam>(0.0);
+        
+        for (int j = 0; j < rbf_vec_dim_c; ++j) {
+          int edge_idx = rbf_vec_idx_c_h[idx_at(j, jc, jb)];
+          int edge_blk = rbf_vec_blk_c_h[blk_at(j, jc, jb)];
+          TypeParam coeff_u = rbf_vec_coeff_c_h[coeff_at(j, 0, jc, jb)];
+          TypeParam coeff_v = rbf_vec_coeff_c_h[coeff_at(j, 1, jc, jb)];
+          
+          expected_u(jc, jk, jb) += 
+              coeff_u * p_vn_in_h[edge_at(edge_idx, jk, edge_blk)];
+          expected_v(jc, jk, jb) += 
+              coeff_v * p_vn_in_h[edge_at(edge_idx, jk, edge_blk)];
+        }
+      }
+    }
+  }
 
-  rbf_vec_interpol_edge_lib<T>(
+  // Compare library output with the reference over the same block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
+        EXPECT_NEAR(p_u_out_h[cell_at(jc, jk, jb)], 
+                   expected_u(jc, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "u failure at block " << jb << ", level " << jk << ", index " << jc;
+        
+        EXPECT_NEAR(p_v_out_h[cell_at(jc, jk, jb)], 
+                   expected_v(jc, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "v failure at block " << jb << ", level " << jk << ", index " << jc;
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ! rbf_vec_interpol_edge
+//
+////////////////////////////////////////////////////////////////////////////////
+
+TYPED_TEST(RbfInterpolSingleParamTest, EdgeSpecific) {  // rbf_vec_interpol_edge_lib on a deterministic index pattern, checked against hard-coded values
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int rbf_vec_dim_e = this->rbf_vec_dim_e;
+
+  // Indexing helpers: each at<...> instantiation turns a multi-index into an offset into the matching flat view
+  const auto &vn_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &blk_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &coeff_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &vt_at = at<nproma, nlev, nblks_e>;
+
+  // Host mirrors of the fixture views, so inputs can be filled and outputs inspected on the host
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto rbf_vec_idx_e_h = Kokkos::create_mirror_view(this->rbf_vec_idx_e);
+  auto rbf_vec_blk_e_h = Kokkos::create_mirror_view(this->rbf_vec_blk_e);
+  auto rbf_vec_coeff_e_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_e);
+  auto p_vt_out_h = Kokkos::create_mirror_view(this->p_vt_out);
+
+  // Fill the edge input field with a deterministic, index-based pattern
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices, so a wrong index/level/block lookup changes the result
+        p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Fill the edge -> edge stencil (indices, blocks, coefficients) with a deterministic pattern
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each edge gets rbf_vec_dim_e stencil entries
+      for (int j = 0; j < rbf_vec_dim_e; ++j) {
+        // Stencil index/block wrap around via modulo, so they stay inside [0, nproma) and [0, nblks_e)
+        rbf_vec_idx_e_h[idx_at(j, ic, ib)] = (ic + j) % nproma;
+        rbf_vec_blk_e_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e;
+        
+        // Interpolation coefficient depends on block, edge index and stencil position
+        rbf_vec_coeff_e_h[coeff_at(j, ic, ib)] = static_cast<TypeParam>(1.0 + ib + 0.1*ic + 0.01 * j); // coefficient
+      }
+      
+      // Zero the output for every level of this edge
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vt_out_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host-initialised data into the fixture views that are handed to the library
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_e, rbf_vec_idx_e_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_e, rbf_vec_blk_e_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_e, rbf_vec_coeff_e_h);
+  Kokkos::deep_copy(this->p_vt_out, p_vt_out_h);
+
+  Kokkos::fence();
+
+  // Run the routine under test on the raw pointers of the fixture views
+  rbf_vec_interpol_edge_lib<TypeParam>(
+      this->p_vn_in.data(), this->rbf_vec_idx_e.data(),
+      this->rbf_vec_blk_e.data(), this->rbf_vec_coeff_e.data(),
+      this->p_vt_out.data(), this->i_startblk, this->i_endblk,
+      this->i_startidx_in, this->i_endidx_in, this->slev, this->elev,
+      nlev, nproma, rbf_vec_dim_e, nblks_e, this->lacc, this->acc_async);
+
+  Kokkos::fence();
+
+  // Bring the computed tangential field back into the host mirror for checking
+  Kokkos::deep_copy(p_vt_out_h, this->p_vt_out);
+
+  // Hard-coded reference values, indexed below with vt_at. NOTE(review): 24 presumably equals nproma * nlev * nblks_e - confirm against the fixture
+  std::vector<TypeParam> expected_vt(24);
+  int idx = 0;
+  std::generate(expected_vt.begin(), expected_vt.end(), [&idx]() {
+    TypeParam values[] = {
+      7.1304, 8.9324, 10.9644, 7.5364, 9.3784, 11.4504,
+      7.9424, 9.8244, 11.9364, 8.3484, 10.2704, 12.4224,
+      14.1502, 16.9522, 19.9842, 14.9562, 17.7982, 20.8702,
+      15.7622, 18.6442, 21.7562, 16.5682, 19.4902, 22.6422,
+    };
+    return values[idx++];
+  });
+
+  // Compare against the reference. NOTE(review): edge loop bounds come from the cell variant get_indices_c_lib - confirm this is intended
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        EXPECT_NEAR(p_vt_out_h[vt_at(je, jk, jb)], 
+                   expected_vt[vt_at(je, jk, jb)],
+                   static_cast<TypeParam>(1e-5))
+            << "Tangential velocity failure at block " << jb << ", level " << jk << ", index " << je;
+      }
+    }
+  }
+}
+
+TYPED_TEST(RbfInterpolSingleParamTest, EdgeRandom) {  // rbf_vec_interpol_edge_lib on seeded random data, checked against a host-side reference sum
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int rbf_vec_dim_e = this->rbf_vec_dim_e;
+
+  // Indexing helpers: each at<...> instantiation turns a multi-index into an offset into the matching flat view
+  const auto &vn_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &blk_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &coeff_at = at<rbf_vec_dim_e, nproma, nblks_e>;
+  const auto &vt_at = at<nproma, nlev, nblks_e>;
+
+  // Host mirrors of the fixture views, so inputs can be filled and outputs inspected on the host
+  auto p_vn_in_h = Kokkos::create_mirror_view(this->p_vn_in);
+  auto rbf_vec_idx_e_h = Kokkos::create_mirror_view(this->rbf_vec_idx_e);
+  auto rbf_vec_blk_e_h = Kokkos::create_mirror_view(this->rbf_vec_blk_e);
+  auto rbf_vec_coeff_e_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_e);
+  auto p_vt_out_h = Kokkos::create_mirror_view(this->p_vt_out);
+
+  // Fixed seed keeps the generated inputs identical from run to run; index ranges are inclusive and stay in bounds
+  std::mt19937 gen(44);  // Different seed from other tests
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+  std::uniform_real_distribution<double> coeff_distrib(0.01, 0.5);
+
+  // Fill the edge input field with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_vn_in_h[vn_at(ic, ik, ib)] = static_cast<TypeParam>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Fill the edge -> edge stencil (indices, blocks, coefficients) with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      
+      for (int j = 0; j < rbf_vec_dim_e; ++j) {
+        // Random edge index and edge block for stencil entry j
+        rbf_vec_idx_e_h[idx_at(j, ic, ib)] = edge_distrib(gen);
+        rbf_vec_blk_e_h[blk_at(j, ic, ib)] = block_distrib(gen);
+        // Random interpolation coefficient for stencil entry j
+        rbf_vec_coeff_e_h[coeff_at(j, ic, ib)] = static_cast<TypeParam>(coeff_distrib(gen));
+      }
+      
+      // Zero the output for every level of this edge
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_vt_out_h[vt_at(ic, ik, ib)] = static_cast<TypeParam>(0.0);
+      }
+    }
+  }
+
+  // Copy the host-initialised data into the fixture views that are handed to the library
+  Kokkos::deep_copy(this->p_vn_in, p_vn_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_e, rbf_vec_idx_e_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_e, rbf_vec_blk_e_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_e, rbf_vec_coeff_e_h);
+  Kokkos::deep_copy(this->p_vt_out, p_vt_out_h);
+
+  Kokkos::fence();
+
+  // Run the routine under test on the raw pointers of the fixture views
+  rbf_vec_interpol_edge_lib<TypeParam>(
       this->p_vn_in.data(), this->rbf_vec_idx_e.data(),
       this->rbf_vec_blk_e.data(), this->rbf_vec_coeff_e.data(),
       this->p_vt_out.data(), this->i_startblk, this->i_endblk,
       this->i_startidx_in, this->i_endidx_in, this->slev, this->elev,
-      this->nlev, this->nproma, this->rbf_vec_dim_e, this->nblks_e, this->lacc,
-      this->acc_async);
-
-  for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
-    for (int jk = 0; jk < this->nlev; ++jk) {
-      for (int i = 0; i < this->nproma; ++i) {
-        size_t idx = i + static_cast<size_t>(jk) * this->nproma +
-                     static_cast<size_t>(jb) * this->nproma * this->nlev;
-        EXPECT_NEAR(this->p_vt_out[idx], static_cast<T>(this->rbf_vec_dim_e),
-                    static_cast<T>(1e-5))
-            << "p_vt_out failure at block " << jb << ", level " << jk
-            << ", index " << i;
+      nlev, nproma, rbf_vec_dim_e, nblks_e, this->lacc, this->acc_async);
+
+  Kokkos::fence();
+
+  // Bring the computed tangential field back into the host mirror for checking
+  Kokkos::deep_copy(p_vt_out_h, this->p_vt_out);
+
+  // Host-side storage for the reference result, shaped (nproma, nlev, nblks_e)
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<TypeParam***, host_space> expected_vt("expected_vt", nproma, nlev, nblks_e);
+
+  // Reference: sum coeff * p_vn_in over the stencil. NOTE(review): edge loop bounds come from the cell variant get_indices_c_lib - confirm this is intended
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        expected_vt(je, jk, jb) = static_cast<TypeParam>(0.0);
+        
+        for (int j = 0; j < rbf_vec_dim_e; ++j) {
+          int edge_idx = rbf_vec_idx_e_h[idx_at(j, je, jb)];
+          int edge_blk = rbf_vec_blk_e_h[blk_at(j, je, jb)];
+          TypeParam coeff = rbf_vec_coeff_e_h[coeff_at(j, je, jb)];
+          
+          expected_vt(je, jk, jb) += 
+              coeff * p_vn_in_h[vn_at(edge_idx, jk, edge_blk)];
+        }
+      }
+    }
+  }
+
+  // Compare library output with the reference over the same block/level/index ranges
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int je = i_startidx; je <= i_endidx; ++je) {
+        EXPECT_NEAR(p_vt_out_h[vt_at(je, jk, jb)], 
+                   expected_vt(je, jk, jb), 
+                   static_cast<TypeParam>(1e-5))
+            << "Tangential velocity failure at block " << jb << ", level " << jk << ", index " << je;
       }
     }
   }
 }
 
-// Define a typed test fixture for the functions which have different input and
-// output types
 template <typename TypePair>
-class RbfVecInterpolMixedTestFixture : public ::testing::Test,
+class RbfVecInterpolDoubleParamTest : public ::testing::Test,
                                        public interp_dimensions {
 public:
   using InType = typename TypePair::in_type;
   using OutType = typename TypePair::out_type;
 
-  // Constant dimensions.
-  static constexpr int nproma = 3;  // inner loop length
-  static constexpr int nlev = 4;    // number of vertical levels
-  static constexpr int nblks_e = 2; // number of edge blocks (for p_e_in)
-  static constexpr int nblks_v =
-      2; // number of vertex blocks (for rbf arrays and outputs)
-  static constexpr int rbf_vec_dim =
-      6; // fixed dimension for rbf vector (stencil points)
+  // Views are allocated in the memory space of Kokkos' default execution space
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+  
+  // Flat (rank-1) Kokkos Views holding the test data; the logical multi-dimensional shape is noted per view
+  Kokkos::View<InType*, memory_space> p_e_in;     // Dimensions: (nproma, nlev, nblks_e)
+  Kokkos::View<int*, memory_space> rbf_vec_idx_v; // Dimensions: (rbf_vec_dim_v, nproma, nblks_v)
+  Kokkos::View<int*, memory_space> rbf_vec_blk_v; // Dimensions: (rbf_vec_dim_v, nproma, nblks_v)
+  Kokkos::View<InType*, memory_space> rbf_vec_coeff_v; // Dimensions: (rbf_vec_dim_v, 2, nproma, nblks_v)
+  Kokkos::View<OutType*, memory_space> p_u_out;   // Dimensions: (nproma, nlev, nblks_v)
+  Kokkos::View<OutType*, memory_space> p_v_out;   // Dimensions: (nproma, nlev, nblks_v)
 
-  // Parameter values.
-  int i_startblk = 0;
-  int i_endblk = 1; // Test blocks [0, 1]
-  int i_startidx_in = 0;
-  int i_endidx_in = nproma - 1; // Full range: 0 .. nproma-1
-  int slev = 0;
-  int elev = nlev - 1;    // Full vertical range (0 .. nlev-1)
-  bool lacc = false;      // Not using ACC-specific behavior.
-  bool acc_async = false; // No asynchronous execution.
-
-  // Arrays stored in std::vector.
-  std::vector<InType> p_e_in;     // Dimensions: (nproma, nlev, nblks_e)
-  std::vector<int> rbf_vec_idx_v; // Dimensions: (rbf_vec_dim, nproma, nblks_v)
-  std::vector<int> rbf_vec_blk_v; // Dimensions: (rbf_vec_dim, nproma, nblks_v)
-  std::vector<InType>
-      rbf_vec_coeff_v;          // Dimensions: (rbf_vec_dim, 2, nproma, nblks_v)
-  std::vector<OutType> p_u_out; // Dimensions: (nproma, nlev, nblks_v)
-  std::vector<OutType> p_v_out; // Dimensions: (nproma, nlev, nblks_v)
-
-  RbfVecInterpolMixedTestFixture() {
-    // Allocate and initialize inputs.
-    p_e_in.resize(num_elements_3d<InType>(nproma, nlev, nblks_e),
-                  static_cast<InType>(1));
-    rbf_vec_idx_v.resize(num_elements_3d<int>(rbf_vec_dim, nproma, nblks_v), 1);
-    rbf_vec_blk_v.resize(num_elements_3d<int>(rbf_vec_dim, nproma, nblks_v), 0);
-    rbf_vec_coeff_v.resize(
-        num_elements_4d<InType>(rbf_vec_dim, 2, nproma, nblks_v),
-        static_cast<InType>(1));
-
-    // Allocate output arrays and initialize to zero.
-    p_u_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v),
-                   static_cast<OutType>(0));
-    p_v_out.resize(num_elements_3d<OutType>(nproma, nlev, nblks_v),
-                   static_cast<OutType>(0));
-  }
+  RbfVecInterpolDoubleParamTest()  // size each view as the product of its logical dimensions; tests fill the contents themselves
+      : p_e_in("p_e_in", nproma * nlev * nblks_e),
+        rbf_vec_idx_v("rbf_vec_idx_v", rbf_vec_dim_v * nproma * nblks_v),
+        rbf_vec_blk_v("rbf_vec_blk_v", rbf_vec_dim_v * nproma * nblks_v),
+        rbf_vec_coeff_v("rbf_vec_coeff_v", rbf_vec_dim_v * 2 * nproma * nblks_v),
+        p_u_out("p_u_out", nproma * nlev * nblks_v),
+        p_v_out("p_v_out", nproma * nlev * nblks_v)
+  {}
 };
 
-TYPED_TEST_SUITE(RbfVecInterpolMixedTestFixture, MixedTypes);
+TYPED_TEST_SUITE(RbfVecInterpolDoubleParamTest, MixedTypes);  // run the fixture's tests once per type pair listed in MixedTypes
+
+////////////////////////////////////////////////////////////////////////////////
+//
+// ! rbf_vec_interpol_vertex
+//
+////////////////////////////////////////////////////////////////////////////////
+
+TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexSpecific) {
+  using InType = typename TestFixture::InType;
+  using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int rbf_vec_dim_v = this->rbf_vec_dim_v;
+
+  // Define indexing helpers
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &blk_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &coeff_at = at<rbf_vec_dim_v, 2, nproma, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
+
+  // Create host mirror views
+  auto p_e_in_h = Kokkos::create_mirror_view(this->p_e_in);
+  auto rbf_vec_idx_v_h = Kokkos::create_mirror_view(this->rbf_vec_idx_v);
+  auto rbf_vec_blk_v_h = Kokkos::create_mirror_view(this->rbf_vec_blk_v);
+  auto rbf_vec_coeff_v_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_v);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Initialize with index-based pattern for edge data
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        // Value depends on all three indices
+        p_e_in_h[edge_at(ic, ik, ib)] = static_cast<InType>(1.0 + ic + ik * 0.1 + ib * 0.01);
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with specific pattern
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      // Each vertex connects to 6 edges
+      for (int j = 0; j < rbf_vec_dim_v; ++j) {
+        // Edge indices with a pattern
+        rbf_vec_idx_v_h[idx_at(j, ic, ib)] = (ic + j) % nproma;
+        rbf_vec_blk_v_h[blk_at(j, ic, ib)] = (ib + j % nblks_e) % nblks_e;
+        
+        // Interpolation coefficients that depend on indices
+        rbf_vec_coeff_v_h[coeff_at(j, 0, ic, ib)] = static_cast<InType>(1.0 + ib + 0.1*ic + 0.01 * j); // x coefficient
+        rbf_vec_coeff_v_h[coeff_at(j, 1, ic, ib)] = static_cast<InType>(2.0 + ib + 0.1*ic + 0.01 * j); // y coefficient
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+        p_v_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_e_in, p_e_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_v, rbf_vec_idx_v_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_v, rbf_vec_blk_v_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_v, rbf_vec_coeff_v_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Call the function
+  rbf_vec_interpol_vertex_lib<InType, OutType>(
+      this->p_e_in.data(), this->rbf_vec_idx_v.data(),
+      this->rbf_vec_blk_v.data(), this->rbf_vec_coeff_v.data(),
+      this->p_u_out.data(), this->p_v_out.data(), this->i_startblk,
+      this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
+      this->elev, nproma, this->lacc, this->acc_async, nlev,
+      nblks_e, nblks_v);
+
+  Kokkos::fence();
 
-TYPED_TEST(RbfVecInterpolMixedTestFixture, Vertex) {
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Expected results based on the specific test values
+  std::vector<OutType> expected_u(24);
+  std::vector<OutType> expected_v(24);
+  int idx = 0;
+  std::generate(expected_u.begin(), expected_u.end(), [&idx]() {
+    OutType values[] = {
+      12.3709, 13.5139, 14.7169, 12.9859, 14.1889, 15.4519,
+      13.6009, 14.8639, 16.1869, 14.2159, 15.5389, 16.9219,
+      24.4006, 25.5436, 26.7466, 25.6156, 26.8186, 28.0816,
+      26.8306, 28.0936, 29.4166, 28.0456, 29.3686, 30.7516
+    };
+    return values[idx++];
+  });
+  idx = 0;
+  std::generate(expected_v.begin(), expected_v.end(), [&idx]() {
+    OutType values[] = {
+      24.4009, 25.5439, 26.7469, 25.6159, 26.8189, 28.0819,
+      26.8309, 28.0939, 29.4169, 28.0459, 29.3689, 30.7519,
+      36.4306, 37.5736, 38.7766, 38.2456, 39.4486, 40.7116,
+      40.0606, 41.3236, 42.6466, 41.8756, 43.1986, 44.5816
+    };
+    return values[idx++];
+  });
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_u_out_h[vert_at(jv, jk, jb)], 
+                   expected_u[vert_at(jv, jk, jb)],
+                   static_cast<OutType>(1e-5))
+            << "u failure at block " << jb << ", level " << jk << ", index " << jv;
+        EXPECT_NEAR(p_v_out_h[vert_at(jv, jk, jb)], 
+                   expected_v[vert_at(jv, jk, jb)],
+                   static_cast<OutType>(1e-5))
+            << "v failure at block " << jb << ", level " << jk << ", index " << jv;
+      }
+    }
+  }
+}
+
+TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexRandom) {
   using InType = typename TestFixture::InType;
   using OutType = typename TestFixture::OutType;
+  
+  constexpr int nproma = this->nproma;
+  constexpr int nlev = this->nlev;
+  constexpr int nblks_e = this->nblks_e;
+  constexpr int nblks_v = this->nblks_v;
+  constexpr int rbf_vec_dim_v = 6;  // Fixed dimension for RBF
+
+  // Define indexing helpers
+  const auto &edge_at = at<nproma, nlev, nblks_e>;
+  const auto &idx_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &blk_at = at<rbf_vec_dim_v, nproma, nblks_v>;
+  const auto &coeff_at = at<rbf_vec_dim_v, 2, nproma, nblks_v>;
+  const auto &vert_at = at<nproma, nlev, nblks_v>;
 
-  // Call the function with mixed precision.
+  // Create host mirror views
+  auto p_e_in_h = Kokkos::create_mirror_view(this->p_e_in);
+  auto rbf_vec_idx_v_h = Kokkos::create_mirror_view(this->rbf_vec_idx_v);
+  auto rbf_vec_blk_v_h = Kokkos::create_mirror_view(this->rbf_vec_blk_v);
+  auto rbf_vec_coeff_v_h = Kokkos::create_mirror_view(this->rbf_vec_coeff_v);
+  auto p_u_out_h = Kokkos::create_mirror_view(this->p_u_out);
+  auto p_v_out_h = Kokkos::create_mirror_view(this->p_v_out);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_int_distribution<int> edge_distrib(0, nproma - 1);
+  std::uniform_int_distribution<int> block_distrib(0, nblks_e - 1);
+  std::uniform_real_distribution<double> real_distrib(0.01, 1.0);
+
+  // Initialize with random values
+  for (int ib = 0; ib < nblks_e; ++ib) {
+    for (int ik = 0; ik < nlev; ++ik) {
+      for (int ic = 0; ic < nproma; ++ic) {
+        p_e_in_h[edge_at(ic, ik, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+    }
+  }
+
+  // Initialize vertex connectivity indices with random values
+  for (int ib = 0; ib < nblks_v; ++ib) {
+    for (int ic = 0; ic < nproma; ++ic) {
+      
+      for (int j = 0; j < rbf_vec_dim_v; ++j) {
+        // Random edge indices and blocks
+        rbf_vec_idx_v_h[idx_at(j, ic, ib)] = edge_distrib(gen);
+        rbf_vec_blk_v_h[blk_at(j, ic, ib)] = block_distrib(gen);
+        // Random coefficients for interpolation
+        rbf_vec_coeff_v_h[coeff_at(j, 0, ic, ib)] = static_cast<InType>(real_distrib(gen));
+        rbf_vec_coeff_v_h[coeff_at(j, 1, ic, ib)] = static_cast<InType>(real_distrib(gen));
+      }
+      
+      // Initialize output to zero
+      for (int ik = 0; ik < nlev; ++ik) {
+        p_u_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+        p_v_out_h[vert_at(ic, ik, ib)] = static_cast<OutType>(0.0);
+      }
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->p_e_in, p_e_in_h);
+  Kokkos::deep_copy(this->rbf_vec_idx_v, rbf_vec_idx_v_h);
+  Kokkos::deep_copy(this->rbf_vec_blk_v, rbf_vec_blk_v_h);
+  Kokkos::deep_copy(this->rbf_vec_coeff_v, rbf_vec_coeff_v_h);
+  Kokkos::deep_copy(this->p_u_out, p_u_out_h);
+  Kokkos::deep_copy(this->p_v_out, p_v_out_h);
+
+  Kokkos::fence();
+
+  // Call the function
   rbf_vec_interpol_vertex_lib<InType, OutType>(
       this->p_e_in.data(), this->rbf_vec_idx_v.data(),
       this->rbf_vec_blk_v.data(), this->rbf_vec_coeff_v.data(),
       this->p_u_out.data(), this->p_v_out.data(), this->i_startblk,
       this->i_endblk, this->i_startidx_in, this->i_endidx_in, this->slev,
-      this->elev, this->nproma, this->lacc, this->acc_async, this->nlev,
-      this->nblks_e, this->nblks_v);
-
-  // Check the outputs only for blocks in the range [i_startblk, i_endblk].
-  for (int block = this->i_startblk; block <= this->i_endblk; ++block) {
-    for (int level = 0; level < this->nlev; ++level) {
-      for (int i = 0; i < this->nproma; ++i) {
-        // Compute the linear index for a 3D array in column-major order:
-        size_t idx =
-            i + level * this->nproma + block * this->nproma * this->nlev;
-        // Since every contribution is 1 and there are 6 stencil points,
-        // expect 6.
-        EXPECT_NEAR(this->p_u_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
-        EXPECT_NEAR(this->p_v_out[idx], static_cast<OutType>(6),
-                    static_cast<OutType>(1e-5))
-            << "Failure at block " << block << ", level " << level << ", index "
-            << i;
+      this->elev, nproma, this->lacc, this->acc_async, nlev,
+      nblks_e, nblks_v);
+
+  Kokkos::fence();
+
+  // Copy results back to host
+  Kokkos::deep_copy(p_u_out_h, this->p_u_out);
+  Kokkos::deep_copy(p_v_out_h, this->p_v_out);
+
+  // Calculate expected values
+  using host_space = Kokkos::HostSpace;
+  Kokkos::View<OutType***, host_space> expected_u("expected_u", nproma, nlev, nblks_v);
+  Kokkos::View<OutType***, host_space> expected_v("expected_v", nproma, nlev, nblks_v);
+
+  // Compute expected values
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        expected_u(jv, jk, jb) = static_cast<OutType>(0.0);
+        expected_v(jv, jk, jb) = static_cast<OutType>(0.0);
+        
+        for (int j = 0; j < rbf_vec_dim_v; ++j) {
+          int edge_idx = rbf_vec_idx_v_h[idx_at(j, jv, jb)];
+          int edge_blk = rbf_vec_blk_v_h[blk_at(j, jv, jb)];
+          InType coeff_u = rbf_vec_coeff_v_h[coeff_at(j, 0, jv, jb)];
+          InType coeff_v = rbf_vec_coeff_v_h[coeff_at(j, 1, jv, jb)];
+          
+          expected_u(jv, jk, jb) += 
+              static_cast<OutType>(coeff_u * p_e_in_h[edge_at(edge_idx, jk, edge_blk)]);
+          expected_v(jv, jk, jb) += 
+              static_cast<OutType>(coeff_v * p_e_in_h[edge_at(edge_idx, jk, edge_blk)]);
+        }
+      }
+    }
+  }
+
+  // Verify results
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    int i_startidx, i_endidx;
+    get_indices_v_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
+                      this->i_startblk, this->i_endblk, i_startidx, i_endidx);
+    
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = i_startidx; jv <= i_endidx; ++jv) {
+        EXPECT_NEAR(p_u_out_h[vert_at(jv, jk, jb)], 
+                   expected_u(jv, jk, jb), 
+                   static_cast<OutType>(1e-5))
+            << "u failure at block " << jb << ", level " << jk << ", index " << jv;
+        EXPECT_NEAR(p_v_out_h[vert_at(jv, jk, jb)], 
+                   expected_v(jv, jk, jb), 
+                   static_cast<OutType>(1e-5))
+            << "v failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
   }
-- 
GitLab


From a116368e03fc0ce3d5025bbfa3c34c17f08588d3 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Wed, 9 Apr 2025 11:00:04 +0200
Subject: [PATCH 21/34] modified test_tdma_solver to enable its use on GPUs

---
 test/c/CMakeLists.txt       |   2 +-
 test/c/test_tdma_solver.cpp | 349 +++++++++++++++++++++++++++++++-----
 2 files changed, 308 insertions(+), 43 deletions(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index 175b226..c93e30a 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -32,7 +32,7 @@ set(SOURCES
   test_horizontal_div.cpp
   test_horizontal_recon.cpp
   test_horizontal_rot.cpp
-  # test_tdma_solver.cpp
+  test_tdma_solver.cpp
   test_interpolation_vector.cpp
   test_intp_rbf.cpp
   test_interpolation_scalar.cpp
diff --git a/test/c/test_tdma_solver.cpp b/test/c/test_tdma_solver.cpp
index 4e09ff3..3a0165a 100644
--- a/test/c/test_tdma_solver.cpp
+++ b/test/c/test_tdma_solver.cpp
@@ -13,76 +13,341 @@
 #include <vector>
 #include <algorithm>
 #include "mo_math_utilities.hpp"
+#include "dim_helper.hpp"
+#include <Kokkos_Core.hpp>
+#include <random>
 
-// Helper function to compute the 1D index for column-major storage.
+// Helper function for column-major indexing
+template <typename T>
 inline int idx(int i, int j, int nrows) {
   return i + j * nrows;
 }
 
-// Test fixture for the TDMA solver tests.
-class TDMASolverTestFixture : public ::testing::Test {
+template <typename T>
+class TDMASolverTypedTestFixture : public ::testing::Test {
 protected:
-  const int n = 10;             // Matrix dimension.
-  std::vector<double> a;        // Input matrix a.
-  std::vector<double> b;        // Input matrix b.
-  std::vector<double> c;        // Input matrix c.
-  std::vector<double> d;        // Input matrix d.
-  std::vector<double> x;        // Output matrix.
-
-  TDMASolverTestFixture()
-      : a(n * n), b(n * n), c(n * n), d(n * n), x(n * n, 0.0) {}
-
-  // SetUp is run before each test.
-  void SetUp() override {
-    // Fill arrays in column-major order.
+  const int n = 10;                 // Matrix dimension.
+
+  // Using Kokkos execution and memory spaces
+  using exec_space = Kokkos::DefaultExecutionSpace;
+  using memory_space = exec_space::memory_space;
+
+  // Kokkos Views for test data
+  Kokkos::View<T*, memory_space> a;      // Input matrix a.
+  Kokkos::View<T*, memory_space> b;      // Input matrix b.
+  Kokkos::View<T*, memory_space> c;      // Input matrix c.
+  Kokkos::View<T*, memory_space> d;      // Input matrix d.
+  Kokkos::View<T*, memory_space> x;      // Output matrix.
+
+  TDMASolverTypedTestFixture()
+      : a("a", n * n),
+        b("b", n * n),
+        c("c", n * n),
+        d("d", n * n),
+        x("x", n * n)
+  {}
+
+  void SetUpSpecificTest() {
+    // Create host mirror views
+    auto a_h = Kokkos::create_mirror_view(a);
+    auto b_h = Kokkos::create_mirror_view(b);
+    auto c_h = Kokkos::create_mirror_view(c);
+    auto d_h = Kokkos::create_mirror_view(d);
+    auto x_h = Kokkos::create_mirror_view(x);
+
+    // Fill arrays in column-major order with the specific test values
     for (int j = 0; j < n; j++) {
       for (int i = 0; i < n; i++) {
-        double value = (i + 1) + (j + 1);
-        a[idx(i, j, n)] = 1.0 * value;
-        b[idx(i, j, n)] = 2.0 * value;
-        c[idx(i, j, n)] = 1.0 * value;
-        d[idx(i, j, n)] = 1.0 * value;
+        T value = static_cast<T>((i + 1) + (j + 1));
+        a_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        b_h[idx<T>(i, j, n)] = static_cast<T>(2.0) * value;
+        c_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        d_h[idx<T>(i, j, n)] = static_cast<T>(1.0) * value;
+        x_h[idx<T>(i, j, n)] = static_cast<T>(0.0);
       }
     }
-    // Clear the output vector.
-    std::fill(x.begin(), x.end(), 0.0);
+
+    // Copy to device
+    Kokkos::deep_copy(a, a_h);
+    Kokkos::deep_copy(b, b_h);
+    Kokkos::deep_copy(c, c_h);
+    Kokkos::deep_copy(d, d_h);
+    Kokkos::deep_copy(x, x_h);
   }
 };
 
-TEST_F(TDMASolverTestFixture, FullTest) {
-  // Call the solver over the full range:
-  tdma_solver_vec<double>(a.data(), b.data(), c.data(), d.data(),
-                         0, n, 0, n, n, n, x.data());
+// Define the types we want to test with
+typedef ::testing::Types<float, double> NumericTypes;
+TYPED_TEST_SUITE(TDMASolverTypedTestFixture, NumericTypes);
+
+// Specific test for the full matrix
+TYPED_TEST(TDMASolverTypedTestFixture, SpecificFull) {
+  const int n = this->n;
+
+  // Set up the test with specific values
+  this->SetUpSpecificTest();
+
+  // Call the solver over the full range
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      0, n, 0, n, n, n, this->x.data());
+
+  // Copy results back to host
+  auto x_h = Kokkos::create_mirror_view(this->x);
+  Kokkos::deep_copy(x_h, this->x);
 
-  // Compute the sum of all elements in the output matrix.
-  double sum = 0.0;
+  // Compute the sum of all elements in the output matrix
+  TypeParam sum = 0.0;
   for (int j = 0; j < n; j++) {
     for (int i = 0; i < n; i++) {
-      sum += x[idx(i, j, n)];
+      sum += x_h[idx<TypeParam>(i, j, n)];
     }
   }
 
-  // Expected reference sum
-  double sum_ref = 27.2727272727272769;
-  double tol = 1e-13;
+  // Expected reference sum (adjusted for precision)
+  TypeParam sum_ref = static_cast<TypeParam>(27.2727272727272769);
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+                 static_cast<TypeParam>(1e-6) : static_cast<TypeParam>(1e-13);
+
   EXPECT_NEAR(sum, sum_ref, tol);
 }
 
-TEST_F(TDMASolverTestFixture, PartialTest) {
-  // Call the solver for a partial region:
-  // For C++: slev = 1, elev = n-1, startidx = 1, endidx = n-1.
-  tdma_solver_vec<double>(a.data(), b.data(), c.data(), d.data(),
-                         1, n - 1, 1, n - 1, n, n, x.data());
+// Specific test for a partial region
+TYPED_TEST(TDMASolverTypedTestFixture, SpecificPartial) {
+  const int n = this->n;
+
+  // Set up the test with specific values
+  this->SetUpSpecificTest();
+
+  // Call the solver for a partial region
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      1, n - 1, 1, n - 1, n, n, this->x.data());
+
+  // Copy results back to host
+  auto x_h = Kokkos::create_mirror_view(this->x);
+  Kokkos::deep_copy(x_h, this->x);
 
   // Compute the sum over a region
-  double sum = 0.0;
+  TypeParam sum = 0.0;
   for (int j = 1; j < n - 1; j++) {
     for (int i = 1; i < n - 1; i++) {
-      sum += x[idx(i, j, n)];
+      sum += x_h[idx<TypeParam>(i, j, n)];
     }
   }
 
-  double sum_ref = 17.7777777777777679;
-  double tol = 1e-13;
+  // Expected reference sum (adjusted for precision)
+  TypeParam sum_ref = static_cast<TypeParam>(17.7777777777777679);
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+                 static_cast<TypeParam>(1e-6) : static_cast<TypeParam>(1e-13);
+
   EXPECT_NEAR(sum, sum_ref, tol);
 }
+
+// Random test for the full matrix
+TYPED_TEST(TDMASolverTypedTestFixture, RandomFull) {
+  const int n = this->n;
+
+  // Create host mirror views
+  auto a_h = Kokkos::create_mirror_view(this->a);
+  auto b_h = Kokkos::create_mirror_view(this->b);
+  auto c_h = Kokkos::create_mirror_view(this->c);
+  auto d_h = Kokkos::create_mirror_view(this->d);
+  auto x_h = Kokkos::create_mirror_view(this->x);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(42);
+  std::uniform_real_distribution<double> diag_dist(5.0, 10.0);     // For main diagonal
+  std::uniform_real_distribution<double> off_diag_dist(0.1, 2.0);  // For off-diagonals
+  std::uniform_real_distribution<double> rhs_dist(-10.0, 10.0);    // For right-hand side
+
+  // Fill arrays with random values
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(diag_dist(gen));
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(rhs_dist(gen));
+      x_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(0.0);
+    }
+  }
+
+  // Save copies for reference solution
+  std::vector<TypeParam> a_copy(n * n);
+  std::vector<TypeParam> b_copy(n * n);
+  std::vector<TypeParam> c_copy(n * n);
+  std::vector<TypeParam> d_copy(n * n);
+  std::vector<TypeParam> x_expected(n * n, 0.0);
+
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      a_copy[idx<TypeParam>(i, j, n)] = a_h[idx<TypeParam>(i, j, n)];
+      b_copy[idx<TypeParam>(i, j, n)] = b_h[idx<TypeParam>(i, j, n)];
+      c_copy[idx<TypeParam>(i, j, n)] = c_h[idx<TypeParam>(i, j, n)];
+      d_copy[idx<TypeParam>(i, j, n)] = d_h[idx<TypeParam>(i, j, n)];
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->a, a_h);
+  Kokkos::deep_copy(this->b, b_h);
+  Kokkos::deep_copy(this->c, c_h);
+  Kokkos::deep_copy(this->d, d_h);
+  Kokkos::deep_copy(this->x, x_h);
+
+  // Call the solver over the full range
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      0, n, 0, n, n, n, this->x.data());
+
+  // Copy results back to host
+  Kokkos::deep_copy(x_h, this->x);
+
+  // Calculate reference solution
+  for (int i = 0; i < n; i++) {
+    // Arrays for internal calculations
+    std::vector<TypeParam> c_prime(n, 0.0);
+    std::vector<TypeParam> d_prime(n, 0.0);
+
+    // Forward sweep
+    c_prime[0] = c_copy[idx<TypeParam>(i, 0, n)] / b_copy[idx<TypeParam>(i, 0, n)];
+    d_prime[0] = d_copy[idx<TypeParam>(i, 0, n)] / b_copy[idx<TypeParam>(i, 0, n)];
+
+    for (int j = 1; j < n; j++) {
+      TypeParam m = static_cast<TypeParam>(1.0) /
+                   (b_copy[idx<TypeParam>(i, j, n)] - c_prime[j-1] * a_copy[idx<TypeParam>(i, j, n)]);
+      c_prime[j] = c_copy[idx<TypeParam>(i, j, n)] * m;
+      d_prime[j] = (d_copy[idx<TypeParam>(i, j, n)] - d_prime[j-1] * a_copy[idx<TypeParam>(i, j, n)]) * m;
+    }
+
+    // Back substitution
+    x_expected[idx<TypeParam>(i, n-1, n)] = d_prime[n-1];
+
+    for (int j = n-2; j >= 0; j--) {
+      x_expected[idx<TypeParam>(i, j, n)] = d_prime[j] - c_prime[j] * x_expected[idx<TypeParam>(i, j+1, n)];
+    }
+  }
+
+  // Set tolerance based on type
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-10);
+
+  // Verify that individual values match
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      EXPECT_NEAR(x_h[idx<TypeParam>(i, j, n)], x_expected[idx<TypeParam>(i, j, n)], tol)
+          << "Mismatch at i=" << i << ", j=" << j;
+    }
+  }
+}
+
+// Random test for a partial region
+TYPED_TEST(TDMASolverTypedTestFixture, RandomPartial) {
+  const int n = this->n;
+  const int slev = 1;
+  const int elev = n - 1;
+  const int startidx = 1;
+  const int endidx = n - 1;
+
+  // Create host mirror views
+  auto a_h = Kokkos::create_mirror_view(this->a);
+  auto b_h = Kokkos::create_mirror_view(this->b);
+  auto c_h = Kokkos::create_mirror_view(this->c);
+  auto d_h = Kokkos::create_mirror_view(this->d);
+  auto x_h = Kokkos::create_mirror_view(this->x);
+
+  // Use fixed seed for reproducibility
+  std::mt19937 gen(43);
+  std::uniform_real_distribution<double> diag_dist(5.0, 10.0);
+  std::uniform_real_distribution<double> off_diag_dist(0.1, 2.0);
+  std::uniform_real_distribution<double> rhs_dist(-10.0, 10.0);
+
+  // Initialize all values to something that shouldn't be touched
+  for (int j = 0; j < n; j++) {
+    for (int i = 0; i < n; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-999.0);
+      x_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(0.0);
+    }
+  }
+
+  // Set random values only for the region to be processed
+  for (int j = slev; j < elev; j++) {
+    for (int i = startidx; i < endidx; i++) {
+      a_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      b_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(diag_dist(gen));
+      c_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(-off_diag_dist(gen));
+      d_h[idx<TypeParam>(i, j, n)] = static_cast<TypeParam>(rhs_dist(gen));
+    }
+  }
+
+  // Save copies for reference solution
+  std::vector<TypeParam> a_copy(n * n, static_cast<TypeParam>(-999.0));
+  std::vector<TypeParam> b_copy(n * n, static_cast<TypeParam>(-999.0));
+  std::vector<TypeParam> c_copy(n * n, static_cast<TypeParam>(-999.0));
+  std::vector<TypeParam> d_copy(n * n, static_cast<TypeParam>(-999.0));
+  std::vector<TypeParam> x_expected(n * n, static_cast<TypeParam>(0.0));
+
+  for (int j = slev; j < elev; j++) {
+    for (int i = startidx; i < endidx; i++) {
+      a_copy[idx<TypeParam>(i, j, n)] = a_h[idx<TypeParam>(i, j, n)];
+      b_copy[idx<TypeParam>(i, j, n)] = b_h[idx<TypeParam>(i, j, n)];
+      c_copy[idx<TypeParam>(i, j, n)] = c_h[idx<TypeParam>(i, j, n)];
+      d_copy[idx<TypeParam>(i, j, n)] = d_h[idx<TypeParam>(i, j, n)];
+    }
+  }
+
+  // Copy to device
+  Kokkos::deep_copy(this->a, a_h);
+  Kokkos::deep_copy(this->b, b_h);
+  Kokkos::deep_copy(this->c, c_h);
+  Kokkos::deep_copy(this->d, d_h);
+  Kokkos::deep_copy(this->x, x_h);
+
+  // Call the solver for the partial region
+  tdma_solver_vec<TypeParam>(
+      this->a.data(), this->b.data(), this->c.data(), this->d.data(),
+      slev, elev, startidx, endidx, n, n, this->x.data());
+
+  // Copy results back to host
+  Kokkos::deep_copy(x_h, this->x);
+
+  // Calculate reference solution for the partial region
+  for (int i = startidx; i < endidx; i++) {
+    // Arrays for internal calculations
+    std::vector<TypeParam> c_prime(n, 0.0);
+    std::vector<TypeParam> d_prime(n, 0.0);
+
+    // Forward sweep
+    c_prime[slev] = c_copy[idx<TypeParam>(i, slev, n)] / b_copy[idx<TypeParam>(i, slev, n)];
+    d_prime[slev] = d_copy[idx<TypeParam>(i, slev, n)] / b_copy[idx<TypeParam>(i, slev, n)];
+
+    for (int j = slev + 1; j < elev; j++) {
+      TypeParam m = static_cast<TypeParam>(1.0) /
+                   (b_copy[idx<TypeParam>(i, j, n)] - c_prime[j-1] * a_copy[idx<TypeParam>(i, j, n)]);
+      c_prime[j] = c_copy[idx<TypeParam>(i, j, n)] * m;
+      d_prime[j] = (d_copy[idx<TypeParam>(i, j, n)] - d_prime[j-1] * a_copy[idx<TypeParam>(i, j, n)]) * m;
+    }
+
+    // Back substitution
+    x_expected[idx<TypeParam>(i, elev-1, n)] = d_prime[elev-1];
+
+    for (int j = elev-2; j >= slev; j--) {
+      x_expected[idx<TypeParam>(i, j, n)] = d_prime[j] - c_prime[j] * x_expected[idx<TypeParam>(i, j+1, n)];
+    }
+  }
+
+  // Set tolerance based on type
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-10);
+
+  // Verify that individual values match
+  for (int j = slev; j < elev; j++) {
+    for (int i = startidx; i < endidx; i++) {
+      EXPECT_NEAR(x_h[idx<TypeParam>(i, j, n)], x_expected[idx<TypeParam>(i, j, n)], tol)
+          << "Mismatch at i=" << i << ", j=" << j;
+    }
+  }
+}
-- 
GitLab


From febab9e1b759a43249d1f46b80030bb86d983c25 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Wed, 9 Apr 2025 11:06:39 +0200
Subject: [PATCH 22/34] fixed a small linting issue

---
 test/c/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/c/CMakeLists.txt b/test/c/CMakeLists.txt
index c93e30a..c0f7c59 100644
--- a/test/c/CMakeLists.txt
+++ b/test/c/CMakeLists.txt
@@ -26,7 +26,6 @@ message(CHECK_PASS "done")
 # Find Kokkos (or use your existing Kokkos installation)
 # find_package(Kokkos REQUIRED)
 
-
 set(SOURCES
   main.cpp
   test_horizontal_div.cpp
-- 
GitLab


From 20f7e7ca5a349189ceaa2b28da7ea152094fa416 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Wed, 9 Apr 2025 11:26:57 +0200
Subject: [PATCH 23/34] changed the tolerance for float in test_tdma_solver

changed a few more tolerance values
---
 test/c/test_tdma_solver.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test/c/test_tdma_solver.cpp b/test/c/test_tdma_solver.cpp
index 3a0165a..bb1dddf 100644
--- a/test/c/test_tdma_solver.cpp
+++ b/test/c/test_tdma_solver.cpp
@@ -107,7 +107,7 @@ TYPED_TEST(TDMASolverTypedTestFixture, SpecificFull) {
   // Expected reference sum (adjusted for precision)
   TypeParam sum_ref = static_cast<TypeParam>(27.2727272727272769);
   TypeParam tol = std::is_same<TypeParam, float>::value ?
-                 static_cast<TypeParam>(1e-6) : static_cast<TypeParam>(1e-13);
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   EXPECT_NEAR(sum, sum_ref, tol);
 }
@@ -139,7 +139,7 @@ TYPED_TEST(TDMASolverTypedTestFixture, SpecificPartial) {
   // Expected reference sum (adjusted for precision)
   TypeParam sum_ref = static_cast<TypeParam>(17.7777777777777679);
   TypeParam tol = std::is_same<TypeParam, float>::value ?
-                 static_cast<TypeParam>(1e-6) : static_cast<TypeParam>(1e-13);
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   EXPECT_NEAR(sum, sum_ref, tol);
 }
@@ -230,7 +230,7 @@ TYPED_TEST(TDMASolverTypedTestFixture, RandomFull) {
 
   // Set tolerance based on type
   TypeParam tol = std::is_same<TypeParam, float>::value ?
-                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-10);
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   // Verify that individual values match
   for (int j = 0; j < n; j++) {
@@ -341,7 +341,7 @@ TYPED_TEST(TDMASolverTypedTestFixture, RandomPartial) {
 
   // Set tolerance based on type
   TypeParam tol = std::is_same<TypeParam, float>::value ?
-                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-10);
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   // Verify that individual values match
   for (int j = slev; j < elev; j++) {
-- 
GitLab


From 5cee3e94f6ebd01998ee73afefb252b86a5f6913 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Wed, 9 Apr 2025 15:09:52 +0200
Subject: [PATCH 24/34] added type-dependent tolerance for the tests that use
 random input arrays

---
 test/c/test_horizontal_div.cpp       | 29 +++++++++----
 test/c/test_horizontal_recon.cpp     | 30 +++++++++++---
 test/c/test_horizontal_rot.cpp       | 10 ++++-
 test/c/test_interpolation_scalar.cpp | 61 ++++++++++++++++++++--------
 test/c/test_interpolation_vector.cpp | 15 ++-----
 test/c/test_intp_rbf.cpp             | 33 ++++++++-------
 6 files changed, 120 insertions(+), 58 deletions(-)

diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index cf5f320..06e6edd 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -238,11 +238,14 @@ TYPED_TEST(HorizontalDivTest, TestDiv3DRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
-                  ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
+                  ref_div_vec_c[div_vec_c_at(i, k, 0)], tol)
           << "Results differ at i=" << i << ", k=" << k;
     }
   }
@@ -468,11 +471,14 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results for first field
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
-                  ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
+                  ref_div_vec_c[div_vec_c_at(i, k, 0)], tol)
           << "First field results differ at i=" << i << ", k=" << k;
     }
   }
@@ -481,7 +487,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv3D2FRandom) {
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(f4dout_h[f4dout_at(i, k, 0, 0)],
-                  ref_f4dout[f4dout_at(i, k, 0, 0)], 1e-5)
+                  ref_f4dout[f4dout_at(i, k, 0, 0)], tol)
           << "Second field results differ at i=" << i << ", k=" << k;
     }
   }
@@ -625,6 +631,9 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
   // Copy results back to host for verification
   Kokkos::deep_copy(f4dout_h, this->f4dout);
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Compute reference result and check
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -642,7 +651,7 @@ TYPED_TEST(HorizontalDivTest, TestDiv4DRandom) {
                         geofac_div_h[geofac_div_at(jc, je, jb)];
           }
 
-          EXPECT_NEAR(f4dout_h[f4dout_at(jc, jk, jb, ji)], expected, 1e-5)
+          EXPECT_NEAR(f4dout_h[f4dout_at(jc, jk, jb, ji)], expected, tol)
               << "Random test fails at jc=" << jc << ", jk=" << jk
               << ", jb=" << jb << ", ji=" << ji;
         }
@@ -998,15 +1007,18 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
-                  ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
+                  ref_div_vec_c[div_vec_c_at(i, k, 0)], tol)
           << "div_vec_c results differ at i=" << i << ", k=" << k;
 
       EXPECT_NEAR(opt_out2_h[div_vec_c_at(i, k, 0)],
-                  ref_opt_out2[div_vec_c_at(i, k, 0)], 1e-5)
+                  ref_opt_out2[div_vec_c_at(i, k, 0)], tol)
           << "opt_out2 results differ at i=" << i << ", k=" << k;
     }
   }
@@ -1332,12 +1344,15 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgRandomNoL2fields) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results - only check div_vec_c since l2fields=false means opt_out2
   // isn't updated
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(div_vec_c_h[div_vec_c_at(i, k, 0)],
-                  ref_div_vec_c[div_vec_c_at(i, k, 0)], 1e-5)
+                  ref_div_vec_c[div_vec_c_at(i, k, 0)], tol)
           << "div_vec_c results differ at i=" << i << ", k=" << k;
     }
   }
diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index d8ea721..0264805 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -342,11 +342,14 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
       EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
@@ -532,11 +535,14 @@ TYPED_TEST(HorizontalReconLinearTest, TestLsqCellSVDRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
       EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
@@ -825,10 +831,13 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(p_coeff_h[p_coeff_at(i, jc, 0, 0)], p_result_h(i, jc), 1e-5)
+      EXPECT_NEAR(p_coeff_h[p_coeff_at(i, jc, 0, 0)], p_result_h(i, jc), tol)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
@@ -1037,11 +1046,14 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int j = 0; j < lsq_dim_unk + 1; ++j) {
     for (int jc = 0; jc < nproma; ++jc) {
       EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], 1e-5)
+                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], tol)
           << "For loop result fails for j = " << j << ", jc = " << jc;
     }
   }
@@ -1306,11 +1318,14 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int j = 0; j < lsq_dim_unk + 1; ++j) {
     for (int jc = 0; jc < nproma; ++jc) {
       EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], 1e-5)
+                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], tol)
           << "For loop result fails for j = " << j << ", jc = " << jc;
     }
   }
@@ -1537,11 +1552,14 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellSVDRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
       EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], 1e-5)
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp
index 69e9d03..f93cddd 100644
--- a/test/c/test_horizontal_rot.cpp
+++ b/test/c/test_horizontal_rot.cpp
@@ -247,11 +247,14 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexAtmosRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)],
-                  ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5)
+                  ref_rot_vec[rot_vec_at(i, k, 0)], tol)
           << "Results differ at i=" << i << ", k=" << k;
     }
   }
@@ -436,11 +439,14 @@ TYPED_TEST(HorizontalRotVertexTest, TestRotVertexRIRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int i = 0; i < nproma; ++i) {
     for (int k = 0; k < nlev; ++k) {
       EXPECT_NEAR(rot_vec_h[rot_vec_at(i, k, 0)],
-                  ref_rot_vec[rot_vec_at(i, k, 0)], 1e-5)
+                  ref_rot_vec[rot_vec_at(i, k, 0)], tol)
           << "Results differ at i=" << i << ", k=" << k << ")";
     }
   }
diff --git a/test/c/test_interpolation_scalar.cpp b/test/c/test_interpolation_scalar.cpp
index 94c33dd..c3158ad 100644
--- a/test/c/test_interpolation_scalar.cpp
+++ b/test/c/test_interpolation_scalar.cpp
@@ -209,6 +209,16 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesSpecific) {
   // Copy results back to host
   Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
 
+  // print results in one line
+  std::cout << "p_edge_out_h: ";
+  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
+    for (int jk = this->slev; jk <= this->elev; ++jk) {
+      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
+        std::cout << p_edge_out_h[edge_at(jv, jk, jb)] << " ";
+      }
+    }
+  }
+  std::cout << std::endl;
   // Expected results based on the specific test values
   std::vector<TypeParam> expected_edges(12);
   int idx = 0;
@@ -330,13 +340,15 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
         EXPECT_NEAR(p_edge_out_h[edge_at(jv, jk, jb)], 
-                   expected_edges(jv, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_edges(jv, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
@@ -534,13 +546,15 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
         EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
-                   expected_verts(jv, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_verts(jv, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
@@ -738,13 +752,15 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
         EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
-                    expected_cells(jc, jk, jb), 
-                    static_cast<TypeParam>(1e-5))
+                    expected_cells(jc, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
@@ -947,14 +963,16 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = 0; jb < nblks_c; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       int nlen = (jb != nblks_c - 1) ? nproma : npromz_c;
       for (int jc = 0; jc < nlen; ++jc) {
         EXPECT_NEAR(p_cell_out_h[cell_at(jc, jk, jb)], 
-                    expected_cells(jc, jk, jb), 
-                    static_cast<TypeParam>(1e-5))
+                    expected_cells(jc, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
@@ -1160,13 +1178,15 @@ TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jc = this->i_startidx; jc <= this->i_endidx; ++jc) {
         EXPECT_NEAR(avg_psi_c_h[avg_at(jc, jk, jb)], 
-                    expected_avg(jc, jk, jb), 
-                    static_cast<TypeParam>(1e-5))
+                    expected_avg(jc, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
@@ -1452,6 +1472,9 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesRandom) {
     }
   }
 
+  OutType tol = std::is_same<OutType, float>::value ?
+               static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13);
+
   // Verify results
   for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -1461,8 +1484,7 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int je = i_startidx; je <= i_endidx; ++je) {
         EXPECT_NEAR(p_edge_out_h[edge_at(je, jk, jb)], 
-                    expected_edges(je, jk, jb), 
-                    static_cast<OutType>(1e-5))
+                    expected_edges(je, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << je;
       }
     }
@@ -1677,6 +1699,9 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsRandom) {
     }
   }
 
+  OutType tol = std::is_same<OutType, float>::value ?
+               static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -1686,8 +1711,7 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jv = i_startidx; jv <= i_endidx; ++jv) {
         EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
-                   expected_verts(jv, jk, jb), 
-                   static_cast<OutType>(1e-5))
+                   expected_verts(jv, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
@@ -1982,6 +2006,9 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRIRandom) {
     }
   }
 
+  OutType tol = std::is_same<OutType, float>::value ?
+               static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13);
+
   // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -1992,13 +2019,11 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRIRandom) {
       for (int jv = i_startidx; jv <= i_endidx; ++jv) {
 #ifdef __LOOP_EXCHANGE
         EXPECT_NEAR(p_vert_out_h[vert_at(jv, jk, jb)], 
-                   expected_verts(jv, jk, jb), 
-                   static_cast<OutType>(1e-5))
+                   expected_verts(jv, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jv;
 #else
         EXPECT_NEAR(p_vert_out_h[vert_at(jk, jv, jb)], 
-                   expected_verts(jv, jk, jb), 
-                   static_cast<OutType>(1e-5))
+                   expected_verts(jv, jk, jb), tol)
             << "Failure at block " << jb << ", level " << jk << ", index " << jv;
 #endif
       }
diff --git a/test/c/test_interpolation_vector.cpp b/test/c/test_interpolation_vector.cpp
index 497b1d5..0806e35 100644
--- a/test/c/test_interpolation_vector.cpp
+++ b/test/c/test_interpolation_vector.cpp
@@ -139,8 +139,6 @@ TYPED_TEST(InterpolationVectorTest, Edges2CellsSpecific) {
   Kokkos::deep_copy(this->p_u_out, p_u_out_h);
   Kokkos::deep_copy(this->p_v_out, p_v_out_h);
 
-  Kokkos::fence();
-
   // Call the function
   edges2cells_vector_lib<TypeParam>(
       this->p_vn_in.data(), this->p_vt_in.data(),
@@ -150,8 +148,6 @@ TYPED_TEST(InterpolationVectorTest, Edges2CellsSpecific) {
       this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in,
       this->slev, this->elev, nproma, nlev, nblks_e, nblks_c);
 
-  Kokkos::fence();
-
   // Copy results back to host
   Kokkos::deep_copy(p_u_out_h, this->p_u_out);
   Kokkos::deep_copy(p_v_out_h, this->p_v_out);
@@ -304,8 +300,6 @@ TYPED_TEST(InterpolationVectorTest, Edges2CellsRandom) {
   Kokkos::deep_copy(this->p_u_out, p_u_out_h);
   Kokkos::deep_copy(this->p_v_out, p_v_out_h);
 
-  Kokkos::fence();
-
   // Call the function
   edges2cells_vector_lib<TypeParam>(
       this->p_vn_in.data(), this->p_vt_in.data(),
@@ -315,8 +309,6 @@ TYPED_TEST(InterpolationVectorTest, Edges2CellsRandom) {
       this->i_startblk, this->i_endblk, this->i_startidx_in, this->i_endidx_in,
       this->slev, this->elev, nproma, nlev, nblks_e, nblks_c);
 
-  Kokkos::fence();
-
   // Copy results back to host
   Kokkos::deep_copy(p_u_out_h, this->p_u_out);
   Kokkos::deep_copy(p_v_out_h, this->p_v_out);
@@ -377,15 +369,16 @@ TYPED_TEST(InterpolationVectorTest, Edges2CellsRandom) {
     }
   }
 
-  Kokkos::fence();
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+                 static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jc = this->i_startidx_in; jc <= this->i_endidx_in; ++jc) {
-        EXPECT_NEAR(p_u_out_h[out_at(jc, jk, 0)], expected_u(jc, jk, 0), 1e-5)
+        EXPECT_NEAR(p_u_out_h[out_at(jc, jk, 0)], expected_u(jc, jk, 0), tol)
             << "u value mismatch at jc=" << jc << ", jk=" << jk;
-        EXPECT_NEAR(p_v_out_h[out_at(jc, jk, 0)], expected_v(jc, jk, 0), 1e-5)
+        EXPECT_NEAR(p_v_out_h[out_at(jc, jk, 0)], expected_v(jc, jk, 0), tol)
             << "v value mismatch at jc=" << jc << ", jk=" << jk;
       }
     }
diff --git a/test/c/test_intp_rbf.cpp b/test/c/test_intp_rbf.cpp
index af72e65..49c7dec 100644
--- a/test/c/test_intp_rbf.cpp
+++ b/test/c/test_intp_rbf.cpp
@@ -372,6 +372,9 @@ TYPED_TEST(RbfInterpolSingleParamTest, C2GradRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -381,12 +384,10 @@ TYPED_TEST(RbfInterpolSingleParamTest, C2GradRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         EXPECT_NEAR(grad_x_h[grad_at(jc, jk, jb)], 
-                   expected_grad_x(jc, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_grad_x(jc, jk, jb), tol)
             << "grad_x failure at block " << jb << ", level " << jk << ", index " << jc;
         EXPECT_NEAR(grad_y_h[grad_at(jc, jk, jb)], 
-                   expected_grad_y(jc, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_grad_y(jc, jk, jb), tol)
             << "grad_y failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
@@ -638,6 +639,9 @@ TYPED_TEST(RbfInterpolSingleParamTest, CellRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -647,13 +651,11 @@ TYPED_TEST(RbfInterpolSingleParamTest, CellRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         EXPECT_NEAR(p_u_out_h[cell_at(jc, jk, jb)], 
-                   expected_u(jc, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_u(jc, jk, jb), tol)
             << "u failure at block " << jb << ", level " << jk << ", index " << jc;
         
         EXPECT_NEAR(p_v_out_h[cell_at(jc, jk, jb)], 
-                   expected_v(jc, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_v(jc, jk, jb), tol)
             << "v failure at block " << jb << ", level " << jk << ", index " << jc;
       }
     }
@@ -871,6 +873,9 @@ TYPED_TEST(RbfInterpolSingleParamTest, EdgeRandom) {
     }
   }
 
+  TypeParam tol = std::is_same<TypeParam, float>::value ?
+               static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -880,8 +885,7 @@ TYPED_TEST(RbfInterpolSingleParamTest, EdgeRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int je = i_startidx; je <= i_endidx; ++je) {
         EXPECT_NEAR(p_vt_out_h[vt_at(je, jk, jb)], 
-                   expected_vt(je, jk, jb), 
-                   static_cast<TypeParam>(1e-5))
+                   expected_vt(je, jk, jb), tol)
             << "Tangential velocity failure at block " << jb << ", level " << jk << ", index " << je;
       }
     }
@@ -1169,6 +1173,9 @@ TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexRandom) {
     }
   }
 
+  OutType tol = std::is_same<OutType, float>::value ?
+               static_cast<OutType>(1e-5) : static_cast<OutType>(1e-13);
+
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
@@ -1178,12 +1185,10 @@ TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexRandom) {
     for (int jk = this->slev; jk <= this->elev; ++jk) {
       for (int jv = i_startidx; jv <= i_endidx; ++jv) {
         EXPECT_NEAR(p_u_out_h[vert_at(jv, jk, jb)], 
-                   expected_u(jv, jk, jb), 
-                   static_cast<OutType>(1e-5))
+                   expected_u(jv, jk, jb), tol)
             << "u failure at block " << jb << ", level " << jk << ", index " << jv;
         EXPECT_NEAR(p_v_out_h[vert_at(jv, jk, jb)], 
-                   expected_v(jv, jk, jb), 
-                   static_cast<OutType>(1e-5))
+                   expected_v(jv, jk, jb), tol)
             << "v failure at block " << jb << ", level " << jk << ", index " << jv;
       }
     }
-- 
GitLab


From 81a8a322c5c1fa9a1cfef06a51720da305b3da07 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:14:31 +0200
Subject: [PATCH 25/34] unified block loop bounds to use jb <= i_endblk

---
 .../mo_lib_interpolation_scalar.cpp            | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/interpolation/mo_lib_interpolation_scalar.cpp b/src/interpolation/mo_lib_interpolation_scalar.cpp
index 51edcda..6b761dc 100644
--- a/src/interpolation/mo_lib_interpolation_scalar.cpp
+++ b/src/interpolation/mo_lib_interpolation_scalar.cpp
@@ -52,7 +52,7 @@ void verts2edges_scalar_lib(const T *p_vertex_in, const int *edge_vertex_idx,
   UnmanagedConstT3D coeff_int_view(coeff_int, nproma, 2, nblks_e);
   UnmanagedT3D p_edge_out_view(p_edge_out, nproma, nlev, nblks_e);
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
     int i_startidx, i_endidx;
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
@@ -117,7 +117,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx,
     i_startblk = i_startblk_in[0];
     i_endblk = i_endblk_in[0];
 
-    for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
       int i_startidx, i_endidx;
       get_indices_e_lib(i_startidx_in[0], i_endidx_in[0], nproma, jb,
@@ -147,7 +147,7 @@ void cells2edges_scalar_lib(const T *p_cell_in, const int *edge_cell_idx,
     i_startblk = i_startblk_in[1];
     i_endblk = i_endblk_in[1];
 
-    for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+    for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
       int i_startidx, i_endidx;
       get_indices_e_lib(i_startidx_in[1], i_endidx_in[1], nproma, jb,
@@ -204,7 +204,7 @@ void edges2verts_scalar_lib(const T *p_edge_in, const int *vert_edge_idx,
   UnmanagedConstT3D v_int_view(v_int, nproma, 6, nblks_v);
   UnmanagedT3D p_vert_out_view(p_vert_out, nproma, nlev, nblks_v);
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
     int i_startidx, i_endidx;
     get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
@@ -277,7 +277,7 @@ void edges2cells_scalar_lib(const T *p_edge_in, const int *edge_idx,
 
   int i_startidx, i_endidx;
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
@@ -345,7 +345,7 @@ void cells2verts_scalar_lib(const T *p_cell_in, const int *vert_cell_idx,
 
   int i_startidx, i_endidx;
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
     get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
@@ -426,7 +426,7 @@ void cells2verts_scalar_ri_lib(const T *p_cell_in, const int *vert_cell_idx,
 
   int i_startidx, i_endidx;
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
 
     get_indices_v_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
@@ -568,7 +568,7 @@ void cell_avg_lib(const T *psi_c, const int *cell_neighbor_idx,
   // block indices of triangles next to each cell, dim: (nproma,nblks_c, 3)
   UnmanagedConstInt3D iblk_view(cell_neighbor_blk, nproma, nblks_c,
                                 3); // cell_neighbour_blk
-  // averaging coefficients, dim: (nproma,nlev,nblks_c)
+  // averaging coefficients, dim: (nproma, 4, nblks_c)
   UnmanagedConstT3D avg_coeff_view(avg_coeff, nproma, 4, nblks_c);
 
   // cell based variable after averaging, dim: (nproma,nlev,nblks_c)
@@ -576,7 +576,7 @@ void cell_avg_lib(const T *psi_c, const int *cell_neighbor_idx,
 
   int i_startidx, i_endidx;
 
-  for (int jb = i_startblk; jb < i_endblk + 1; ++jb) {
+  for (int jb = i_startblk; jb <= i_endblk; ++jb) {
     get_indices_c_lib(i_startidx_in, i_endidx_in, nproma, jb, i_startblk,
                       i_endblk, i_startidx, i_endidx);
 
-- 
GitLab


From 6eb05ce5500298f21e8b6da6110a15399f41b005 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:14:54 +0200
Subject: [PATCH 26/34] removed commented-out code and an unused typedef

---
 src/horizontal/mo_lib_divrot.cpp | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/src/horizontal/mo_lib_divrot.cpp b/src/horizontal/mo_lib_divrot.cpp
index dbbef65..0138cc0 100644
--- a/src/horizontal/mo_lib_divrot.cpp
+++ b/src/horizontal/mo_lib_divrot.cpp
@@ -255,10 +255,7 @@ void recon_lsq_cell_q(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                    p_cc_view(jc, jk, jb);
           z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-    //     });
-    // Kokkos::parallel_for(
-    //     "recon_lsq_cell_q_step2", innerPolicy,
-    //     KOKKOS_LAMBDA(const int jk, const int jc) {
+
           z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] +
                             lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] +
                             lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2] +
@@ -413,10 +410,7 @@ void recon_lsq_cell_q_svd(const T *p_cc, const int *lsq_idx_c,
                            p_cc_view(jc, jk, jb);
           z_b[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-        // });
-    // Kokkos::parallel_for(
-    //     "recon_lsq_cell_q_svd_step2", innerPolicy,
-    //     KOKKOS_LAMBDA(const int jk, const int jc) {
+
           p_coeff_view(5, jc, jk, jb) =
               lsq_pseudoinv_view(jc, 4, 0, jb) * z_b[0] +
               lsq_pseudoinv_view(jc, 4, 1, jb) * z_b[1] +
@@ -558,10 +552,7 @@ void recon_lsq_cell_c(const T *p_cc, const int *lsq_idx_c, const int *lsq_blk_c,
                            p_cc_view(jc, jk, jb);
           z_d[8] = p_cc_view(iidx(jc, jb, 8), jk, iblk(jc, jb, 8)) -
                            p_cc_view(jc, jk, jb);
-    //     });
-    // Kokkos::parallel_for(
-    //     "recon_lsq_cell_c_step2", innerPolicy,
-    //     KOKKOS_LAMBDA(const int jk, const int jc) {
+
           z_qt_times_d[0] = lsq_qtmat_c_view(jc, 0, 0, jb) * z_d[0] +
                             lsq_qtmat_c_view(jc, 0, 1, jb) * z_d[1] +
                             lsq_qtmat_c_view(jc, 0, 2, jb) * z_d[2] +
@@ -1023,8 +1014,6 @@ void div4d(const int *cell_edge_idx, const int *cell_edge_blk,
   typedef Kokkos::View<const T ****, Kokkos::LayoutLeft,
                        Kokkos::MemoryUnmanaged>
       UnmanagedConstT4D;
-  typedef Kokkos::View<T ***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged>
-      UnmanagedT3D;
   typedef Kokkos::View<T ****, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged>
       UnmanagedT4D;
   typedef Kokkos::View<const int ***, Kokkos::LayoutLeft,
-- 
GitLab


From ce1cfd347a0e5da340738d94319a04b12a9e2f42 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:15:44 +0200
Subject: [PATCH 27/34] removed some redundant print statements

---
 test/c/test_interpolation_scalar.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/test/c/test_interpolation_scalar.cpp b/test/c/test_interpolation_scalar.cpp
index c3158ad..c84f4fe 100644
--- a/test/c/test_interpolation_scalar.cpp
+++ b/test/c/test_interpolation_scalar.cpp
@@ -209,15 +209,6 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesSpecific) {
   // Copy results back to host
   Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
 
-  // print results in one line
-  std::cout << "p_edge_out_h: ";
-  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
-    for (int jk = this->slev; jk <= this->elev; ++jk) {
-      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
-        std::cout << p_edge_out_h[edge_at(jv, jk, jb)] << " ";
-      }
-    }
-  }
   std::cout << std::endl;
   // Expected results based on the specific test values
   std::vector<TypeParam> expected_edges(12);
-- 
GitLab


From 77aa898000cf8d1e2f60b4ddb47c0cf513aed133 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:16:06 +0200
Subject: [PATCH 28/34] removed declaration of unused integers

---
 test/c/test_horizontal_div.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index 06e6edd..9f7f4c0 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -665,7 +665,6 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecific) {
   constexpr int nlev = this->nlev;
   constexpr int nblks_c = this->nblks_c;
   constexpr int nblks_e = this->nblks_e;
-  constexpr int dim4d = this->dim4d;
 
   const auto &vec_e_at = at<nproma, nlev, nblks_e>;
   const auto &cell_edge_at = at<nproma, nblks_c, 3>;
@@ -1029,7 +1028,6 @@ TYPED_TEST(HorizontalDivTest, TestDivAvgSpecificNoL2fields) {
   constexpr int nlev = this->nlev;
   constexpr int nblks_c = this->nblks_c;
   constexpr int nblks_e = this->nblks_e;
-  constexpr int dim4d = this->dim4d;
 
   const auto &vec_e_at = at<nproma, nlev, nblks_e>;
   const auto &cell_edge_at = at<nproma, nblks_c, 3>;
-- 
GitLab


From 075c960034a3487a27f68405cd0a7417360790c2 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:16:49 +0200
Subject: [PATCH 29/34] reverted unnecessary loop index renames

---
 test/c/test_horizontal_recon.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index 0264805..32d084a 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -1050,11 +1050,11 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellSVDRandom) {
                static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   // Check result
-  for (int j = 0; j < lsq_dim_unk + 1; ++j) {
+  for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], tol)
-          << "For loop result fails for j = " << j << ", jc = " << jc;
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
+          << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
 }
@@ -1322,11 +1322,11 @@ TYPED_TEST(HorizontalReconCubicTest, TestLsqCellRandom) {
                static_cast<TypeParam>(1e-5) : static_cast<TypeParam>(1e-13);
 
   // Check result
-  for (int j = 0; j < lsq_dim_unk + 1; ++j) {
+  for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(p_coeff_h[(p_coeff_at(j, jc, 0, 0))],
-                  p_result[(at<lsq_dim_unk + 1, nproma>(j, jc))], tol)
-          << "For loop result fails for j = " << j << ", jc = " << jc;
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
+          << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
 }
-- 
GitLab


From 25b45d792a1bd72b5d90547dbb54f244424bf0ca Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:17:20 +0200
Subject: [PATCH 30/34] fixed issues with a test routine in
 test_horizontal_recon

---
 test/c/test_horizontal_recon.cpp | 103 ++++++++++++++-----------------
 1 file changed, 45 insertions(+), 58 deletions(-)

diff --git a/test/c/test_horizontal_recon.cpp b/test/c/test_horizontal_recon.cpp
index 32d084a..b83886c 100644
--- a/test/c/test_horizontal_recon.cpp
+++ b/test/c/test_horizontal_recon.cpp
@@ -743,89 +743,75 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
   // Copy results back to host
   Kokkos::deep_copy(p_coeff_h, this->p_coeff);
 
-  // Create host views for reference computation
-  using host_space = Kokkos::HostSpace;
-
-  // Arrays for intermediate calculations
-  Kokkos::View<TypeParam ***, host_space> z_d_h("z_d_h", lsq_dim_c, nproma,
-                                                nlev);
-  Kokkos::View<TypeParam *, host_space> z_qt_times_d_h("z_qt_times_d_h",
-                                                       lsq_dim_unk);
+  // Compute reference result
+  std::vector<TypeParam> z_d(lsq_dim_c);
+  std::vector<TypeParam> z_qt_times_d(lsq_dim_unk);
+  std::vector<TypeParam> p_result((lsq_dim_unk + 1) * nproma);
 
-  // Result view
-  Kokkos::View<TypeParam **, host_space> p_result_h("p_result_h",
-                                                    lsq_dim_unk + 1, nproma);
+  for (int i = 0; i < nproma; ++i) {
+    for (int j = 0; j < lsq_dim_unk + 1; ++j) {
+      p_result[(at<lsq_dim_unk + 1, nproma>(j, i))] = static_cast<TypeParam>(0.0);
+    }
+  }
 
-  // calculating only for jb=0
+  // doing the calculation only for jb=0
   for (int jb = this->i_startblk; jb < this->i_endblk; ++jb) {
     int i_startidx, i_endidx;
     get_indices_c_lib(this->i_startidx_in, this->i_endidx_in, nproma, jb,
                       this->i_startblk, this->i_endblk, i_startidx, i_endidx);
 
-    // Step 1: Calculate z_d values (matches the "recon_lsq_cell_q_step1"
-    // parallel_for)
-    // calculating only for jk = 0
+    // Step 1: Calculate z_d values
     for (int jk = this->slev; jk < this->elev; ++jk) {
       for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         for (int i = 0; i < lsq_dim_c; ++i) {
-          z_d_h(i, jc, jk) =
-              p_cc_h[p_cc_at(
-                  cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
-                  cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
-              p_cc_h[p_cc_at(jc, jk, jb)];
+          z_d[i] = p_cc_h[p_cc_at(
+                       cell_neighbor_idx_h[cell_neighbor_at(jc, jb, i)], jk,
+                       cell_neighbor_blk_h[cell_neighbor_at(jc, jb, i)])] -
+                   p_cc_h[p_cc_at(jc, jk, jb)];
         }
-      }
-    }
 
-    // Step 2: Calculate coefficients (matches the "recon_lsq_cell_q_step2"
-    // parallel_for)
-    // calculating only for jk = 0
-    for (int jk = this->slev; jk < this->elev; ++jk) {
-      for (int jc = i_startidx; jc <= i_endidx; ++jc) {
         // Matrix multiplication (Q^T * d)
         for (int j = 0; j < lsq_dim_unk; ++j) {
-          z_qt_times_d_h(j) = 0.0;
+          z_qt_times_d[j] = 0.0;
           for (int i = 0; i < lsq_dim_c; ++i) {
-            z_qt_times_d_h(j) +=
-                lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d_h(i, jc, jk);
+            z_qt_times_d[j] +=
+                lsq_qtmat_c_h[qtmat_at(jc, j, i, jb)] * z_d[i];
           }
         }
 
         // Back-substitution (mirrors the order in the GPU implementation)
-        p_result_h(5, jc) =
-            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 4, jb)] * z_qt_times_d_h(4);
+        p_result[at<lsq_dim_unk + 1, nproma>(5, jc)] =
+            lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 4, jb)] * z_qt_times_d[4];
 
-        p_result_h(4, jc) =
+        p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] =
             lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 3, jb)] *
-            (z_qt_times_d_h(3) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] * p_result_h(5, jc));
-
-        p_result_h(3, jc) =
+            (z_qt_times_d[3] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 0, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]);
+        p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] =
             lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 2, jb)] *
-            (z_qt_times_d_h(2) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 1, jb)] * p_result_h(4, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 2, jb)] * p_result_h(5, jc));
-
-        p_result_h(2, jc) =
+            (z_qt_times_d[2] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 1, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 2, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]);
+        p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] =
             lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 1, jb)] *
-            (z_qt_times_d_h(1) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 3, jb)] * p_result_h(3, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 4, jb)] * p_result_h(4, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 5, jb)] * p_result_h(5, jc));
-
-        p_result_h(1, jc) =
+            (z_qt_times_d[1] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 3, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 4, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 5, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]);
+        p_result[at<lsq_dim_unk + 1, nproma>(1, jc)] =
             lsq_rmat_rdiag_c_h[rmat_rdiag_at(jc, 0, jb)] *
-            (z_qt_times_d_h(0) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 6, jb)] * p_result_h(2, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 7, jb)] * p_result_h(3, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 8, jb)] * p_result_h(4, jc) -
-             lsq_rmat_utri_c_h[rmat_utri_at(jc, 9, jb)] * p_result_h(5, jc));
-
+            (z_qt_times_d[0] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 6, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(2, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 7, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(3, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 8, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(4, jc)] -
+             lsq_rmat_utri_c_h[rmat_utri_at(jc, 9, jb)] * p_result[at<lsq_dim_unk + 1, nproma>(5, jc)]);
         // Conservation correction
-        p_result_h(0, jc) = p_cc_h[p_cc_at(jc, jk, jb)];
+        p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] =
+            p_cc_h[p_cc_at(jc, jk, jb)];
         for (int j = 0; j < lsq_dim_unk; ++j) {
-          p_result_h(0, jc) -=
-              p_result_h(j + 1, jc) * lsq_moments_h[moments_at(jc, jb, j)];
+          p_result[at<lsq_dim_unk + 1, nproma>(0, jc)] -=
+              p_result[at<lsq_dim_unk + 1, nproma>(j + 1, jc)] *
+              lsq_moments_h[moments_at(jc, jb, j)];
         }
       }
     }
@@ -837,7 +823,8 @@ TYPED_TEST(HorizontalReconQuadraticTest, TestLsqCellRandom) {
   // Check result
   for (int i = 0; i < lsq_dim_unk + 1; ++i) {
     for (int jc = 0; jc < nproma; ++jc) {
-      EXPECT_NEAR(p_coeff_h[p_coeff_at(i, jc, 0, 0)], p_result_h(i, jc), tol)
+      EXPECT_NEAR(p_coeff_h[(p_coeff_at(i, jc, 0, 0))],
+                  p_result[(at<lsq_dim_unk + 1, nproma>(i, jc))], tol)
           << "For loop result fails for i = " << i << ", jc = " << jc;
     }
   }
-- 
GitLab


From d499e6926fa483a38f2c2208146396870b3cd975 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:23:05 +0200
Subject: [PATCH 31/34] changed the name of a local variable in
 mo_lib_loopindices

---
 src/support/mo_lib_loopindices.cpp | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/support/mo_lib_loopindices.cpp b/src/support/mo_lib_loopindices.cpp
index 8c8d318..fcc31b6 100644
--- a/src/support/mo_lib_loopindices.cpp
+++ b/src/support/mo_lib_loopindices.cpp
@@ -19,19 +19,19 @@ void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int
     //Since code is ported incrementally from Fortran to C++, depending on where the function is called from
     //(either fortran or c++), the first index should be either 0 or 1.
     int first_index;
-    int nproma_loc;
+    int i_endidx_loc;
     if (called_from_cpp){
         first_index = 0;
-        nproma_loc = nproma - 1;
+        i_endidx_loc = nproma - 1;
     }
     else {
         first_index = 1;
-        nproma_loc = nproma;
+        i_endidx_loc = nproma;
     }
 
     if (i_blk == i_startblk) {
         i_startidx_out = std::max(first_index, i_startidx_in);
-        i_endidx_out = nproma_loc;
+        i_endidx_out = i_endidx_loc;
         if (i_blk == i_endblk) {
             i_endidx_out = i_endidx_in;
         }
@@ -40,7 +40,7 @@ void get_indices_c_lib(const int i_startidx_in, const int i_endidx_in, const int
         i_endidx_out = i_endidx_in;
     } else {
         i_startidx_out = first_index;
-        i_endidx_out = nproma_loc;
+        i_endidx_out = i_endidx_loc;
     }
 }
 
@@ -52,18 +52,18 @@ void get_indices_e_lib(const int i_startidx_in, const int i_endidx_in, const int
     //Since code is ported incrementally from Fortran to C++, depending on where the function is called from,
     //the first index should be either 0 or 1.
     int first_index;
-    int nproma_loc;
+    int i_endidx_loc;
     if (called_from_cpp) {
         first_index = 0;
-        nproma_loc = nproma - 1;
+        i_endidx_loc = nproma - 1;
     }
     else {
         first_index = 1;
-        nproma_loc = nproma;
+        i_endidx_loc = nproma;
     }
 
     i_startidx_out = (i_blk != i_startblk) ? first_index : std::max(first_index, i_startidx_in);
-    i_endidx_out = (i_blk != i_endblk) ? nproma_loc : i_endidx_in;
+    i_endidx_out = (i_blk != i_endblk) ? i_endidx_loc : i_endidx_in;
 }
 
 // get_indices_v_lib function
@@ -74,19 +74,19 @@ void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int
     //Since code is ported incrementally from Fortran to C++, depending on where the function is called from,
     //the first index should be either 0 or 1.
     int first_index;
-    int nproma_loc;
+    int i_endidx_loc;
     if (called_from_cpp) {
         first_index = 0;
-        nproma_loc = nproma - 1;
+        i_endidx_loc = nproma - 1;
     }
     else {
         first_index = 1;
-        nproma_loc = nproma;
+        i_endidx_loc = nproma;
     }
 
     if (i_blk == i_startblk) {
         i_startidx_out = i_startidx_in;
-        i_endidx_out = nproma_loc;
+        i_endidx_out = i_endidx_loc;
         if (i_blk == i_endblk) {
             i_endidx_out = i_endidx_in;
         }
@@ -95,6 +95,6 @@ void get_indices_v_lib(const int i_startidx_in, const int i_endidx_in, const int
         i_endidx_out = i_endidx_in;
     } else {
         i_startidx_out = first_index;
-        i_endidx_out = nproma_loc;
+        i_endidx_out = i_endidx_loc;
     }
 }
-- 
GitLab


From 57f9fd4482196a0ea5c8f7599f20633f2940f068 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 14:56:35 +0200
Subject: [PATCH 32/34] simplified the way the reference result is assigned

---
 test/c/test_interpolation_scalar.cpp | 84 ++++++----------------------
 test/c/test_intp_rbf.cpp             | 63 +++++----------------
 2 files changed, 30 insertions(+), 117 deletions(-)

diff --git a/test/c/test_interpolation_scalar.cpp b/test/c/test_interpolation_scalar.cpp
index c84f4fe..20ccf4f 100644
--- a/test/c/test_interpolation_scalar.cpp
+++ b/test/c/test_interpolation_scalar.cpp
@@ -209,17 +209,11 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2EdgesSpecific) {
   // Copy results back to host
   Kokkos::deep_copy(p_edge_out_h, this->p_edge_out);
 
-  std::cout << std::endl;
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_edges(12);
-  int idx = 0;
-  std::generate(expected_edges.begin(), expected_edges.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_edges = {
       1.505, 1.015, 1.605, 1.116, 1.705, 1.217,
       1.525, 1.0251, 1.626, 1.1271, 1.727, 1.2291
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -421,15 +415,10 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Edges2VertsSpecific) {
   Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_verts(12);
-  int idx = 0;
-  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_verts = {
       1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
       1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -627,15 +616,10 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Edges2CellsSpecific) {
   Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_cells(12);
-  int idx = 0;
-  std::generate(expected_cells.begin(), expected_cells.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_cells = {
       1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261,
       1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -833,15 +817,10 @@ TYPED_TEST(InterpolationScalarSingleParamTest, Verts2CellsSpecific) {
   Kokkos::deep_copy(p_cell_out_h, this->p_cell_out);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_cells(12);
-  int idx = 0;
-  std::generate(expected_cells.begin(), expected_cells.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_cells = {
       1.37677, 1.7201, 1.47977, 1.8231, 1.58277, 1.9261,
       1.3802, 1.72353, 1.4832, 1.82653, 1.5862, 1.92953
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results - check the same ranges as in the expected calculation
   for (int jb = 0; jb < nblks_c; ++jb) {
@@ -1047,15 +1026,10 @@ TYPED_TEST(InterpolationScalarSingleParamTest, CellAvgLibSpecific) {
   Kokkos::deep_copy(avg_psi_c_h, this->avg_psi_c);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_avg(12);
-  int idx = 0;
-  std::generate(expected_avg.begin(), expected_avg.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_avg = {
       1.402, 1.602, 1.502, 1.702, 1.602, 1.802,
       1.408, 1.608, 1.508, 1.708, 1.608, 1.808
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -1324,15 +1298,10 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2EdgesSpecific) {
   int i_endidx_range = this->i_endidx_in[1];
 
   // Expected results based on the specific test values
-  std::vector<OutType> expected_edges(12);
-  int idx = 0;
-  std::generate(expected_edges.begin(), expected_edges.end(), [&idx]() {
-    OutType values[] = {
+  std::vector<OutType> expected_edges = {
       1.505, 1.5149, 1.605, 1.6149, 1.705, 1.7149,
       1.505, 1.5151, 1.605, 1.6151, 1.705, 1.7151
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = i_startblk; jb <= i_endblk; ++jb) {
@@ -1560,15 +1529,10 @@ TYPED_TEST(InterpolationScalarDoubleParamTest, Cells2VertsSpecific) {
   Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
 
   // Expected results based on the specific test values
-  std::vector<OutType> expected_verts(12);
-  int idx = 0;
-  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
-    OutType values[] = {
+  std::vector<OutType> expected_verts = {
       1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
       1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -1833,10 +1797,7 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRISpecific) {
   Kokkos::deep_copy(p_vert_out_h, this->p_vert_out);
 
   // Expected results based on the specific test values
-  std::vector<OutType> expected_verts(12);
-  int idx = 0;
-  std::generate(expected_verts.begin(), expected_verts.end(), [&idx]() {
-    OutType values[] = {
+  std::vector<OutType> expected_verts = {
 #ifdef __LOOP_EXCHANGE
       1.7459, 1.7159, 1.8609, 1.8309, 1.9759, 1.9459,
       1.7456, 1.7156, 1.8606, 1.8306, 1.9756, 1.9456
@@ -1844,20 +1805,7 @@ TYPED_TEST(Cells2vertsriScalarLibTestFixture, Cells2VertsRISpecific) {
       1.7459, 1.8609, 1.9759, 1.7159, 1.8309, 1.9459,
       1.7456, 1.8606, 1.9756, 1.7156, 1.8306, 1.9456
 #endif
-    };
-    return values[idx++];
-  });
-
-  std::cout << "p_vert_out_h: " << std::endl;
-  // print out the array p_vert_out_h in one line
-  for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
-    for (int jk = this->slev; jk <= this->elev; ++jk) {
-      for (int jv = this->i_startidx; jv <= this->i_endidx; ++jv) {
-        // std::cout << p_vert_out_h[vert_at(jk, jv, jb)] << ", ";
-        std::cout << p_vert_out_h[vert_at(jv, jk, jb)] << ", ";
-      }
-    }
-  }
+  };
 
   // Verify results - using the appropriate indexing depending on __LOOP_EXCHANGE
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
diff --git a/test/c/test_intp_rbf.cpp b/test/c/test_intp_rbf.cpp
index 49c7dec..a67480d 100644
--- a/test/c/test_intp_rbf.cpp
+++ b/test/c/test_intp_rbf.cpp
@@ -204,29 +204,19 @@ TYPED_TEST(RbfInterpolSingleParamTest, C2GradSpecific) {
   Kokkos::deep_copy(grad_y_h, this->grad_y);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_grad_x(24);
-  std::vector<TypeParam> expected_grad_y(24);
-  int idx = 0;
-  std::generate(expected_grad_x.begin(), expected_grad_x.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_grad_x = {
       19.9225, 22.9275, 26.2225, 20.9675, 24.0725, 27.4675,
       22.0125, 25.2175, 28.7125, 23.0575, 26.3625, 29.9575,
       38.972, 42.977, 47.272, 41.017, 45.122, 49.517,
       43.062, 47.267, 51.762, 45.107, 49.412, 54.007
-    };
-    return values[idx++];
-  });
+  };
 
-  idx = 0;
-  std::generate(expected_grad_y.begin(), expected_grad_y.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_grad_y = {
       38.9725, 42.9775, 47.2725, 41.0175, 45.1225, 49.5175,
       43.0625, 47.2675, 51.7625, 45.1075, 49.4125, 54.0075,
       58.022, 63.027, 68.322, 61.067, 66.172, 71.567,
       64.112, 69.317, 74.812, 67.157, 72.462, 78.057 
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -480,28 +470,18 @@ TYPED_TEST(RbfInterpolSingleParamTest, CellSpecific) {
   Kokkos::deep_copy(p_v_out_h, this->p_v_out);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_u(24);
-  std::vector<TypeParam> expected_v(24);
-  int idx = 0;
-  std::generate(expected_u.begin(), expected_u.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_u = {
       18.8216, 20.5356, 22.3396, 19.7576, 21.5616, 23.4556,
       20.6936, 22.5876, 24.5716, 21.6296, 23.6136, 25.6876,
       36.882, 38.597, 40.402, 38.718, 40.523, 42.418,
       40.554, 42.449, 44.434, 42.39, 44.375, 46.45
-    };
-    return values[idx++];
-  });
-  idx = 0;
-  std::generate(expected_v.begin(), expected_v.end(), [&idx]() {
-    TypeParam values[] = {
+  };
+  std::vector<TypeParam> expected_v = {
       36.8616, 38.5756, 40.3796, 38.6976, 40.5016, 42.3956,
       40.5336, 42.4276, 44.4116, 42.3696, 44.3536, 46.4276,
       54.932, 56.647, 58.452, 57.668, 59.473, 61.368,
       60.404, 62.299, 64.284, 63.14, 65.125, 67.2
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -741,17 +721,12 @@ TYPED_TEST(RbfInterpolSingleParamTest, EdgeSpecific) {
   Kokkos::deep_copy(p_vt_out_h, this->p_vt_out);
 
   // Expected results based on the specific test values
-  std::vector<TypeParam> expected_vt(24);
-  int idx = 0;
-  std::generate(expected_vt.begin(), expected_vt.end(), [&idx]() {
-    TypeParam values[] = {
+  std::vector<TypeParam> expected_vt = {
       7.1304, 8.9324, 10.9644, 7.5364, 9.3784, 11.4504,
       7.9424, 9.8244, 11.9364, 8.3484, 10.2704, 12.4224,
       14.1502, 16.9522, 19.9842, 14.9562, 17.7982, 20.8702,
       15.7622, 18.6442, 21.7562, 16.5682, 19.4902, 22.6422,
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
@@ -1012,28 +987,18 @@ TYPED_TEST(RbfVecInterpolDoubleParamTest, VertexSpecific) {
   Kokkos::deep_copy(p_v_out_h, this->p_v_out);
 
   // Expected results based on the specific test values
-  std::vector<OutType> expected_u(24);
-  std::vector<OutType> expected_v(24);
-  int idx = 0;
-  std::generate(expected_u.begin(), expected_u.end(), [&idx]() {
-    OutType values[] = {
+  std::vector<OutType> expected_u = {
       12.3709, 13.5139, 14.7169, 12.9859, 14.1889, 15.4519,
       13.6009, 14.8639, 16.1869, 14.2159, 15.5389, 16.9219,
       24.4006, 25.5436, 26.7466, 25.6156, 26.8186, 28.0816,
       26.8306, 28.0936, 29.4166, 28.0456, 29.3686, 30.7516
-    };
-    return values[idx++];
-  });
-  idx = 0;
-  std::generate(expected_v.begin(), expected_v.end(), [&idx]() {
-    OutType values[] = {
+  };
+  std::vector<OutType> expected_v = {
       24.4009, 25.5439, 26.7469, 25.6159, 26.8189, 28.0819,
       26.8309, 28.0939, 29.4169, 28.0459, 29.3689, 30.7519,
       36.4306, 37.5736, 38.7766, 38.2456, 39.4486, 40.7116,
       40.0606, 41.3236, 42.6466, 41.8756, 43.1986, 44.5816
-    };
-    return values[idx++];
-  });
+  };
 
   // Verify results
   for (int jb = this->i_startblk; jb <= this->i_endblk; ++jb) {
-- 
GitLab


From 8e85077c588fe690ec7c25256cccbf5525acb465 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Mon, 14 Apr 2025 15:04:37 +0200
Subject: [PATCH 33/34] moved slev/elev setup into the initializer list

---
 test/c/test_horizontal_div.cpp | 12 ++++--------
 test/c/test_horizontal_rot.cpp | 12 ++++--------
 2 files changed, 8 insertions(+), 16 deletions(-)

diff --git a/test/c/test_horizontal_div.cpp b/test/c/test_horizontal_div.cpp
index 9f7f4c0..ef95f6b 100644
--- a/test/c/test_horizontal_div.cpp
+++ b/test/c/test_horizontal_div.cpp
@@ -58,7 +58,9 @@ protected:
   Kokkos::View<ValueType *, memory_space> opt_out2;
 
   HorizontalDivTest()
-      : vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
+      : slev(dim4d, 0),
+        elev(dim4d, nlev - 1), // Full vertical range (0 .. nlev-1)
+        vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
         cell_edge_idx("cell_edge_idx", dim_combine(nproma, nblks_c, 3)),
         cell_edge_blk("cell_edge_blk", dim_combine(nproma, nblks_c, 3)),
         geofac_div("geofac_div", dim_combine(nproma, 3, nblks_c)),
@@ -69,13 +71,7 @@ protected:
         cell_neighbor_blk("cell_neighbor_blk", dim_combine(nproma, nblks_c, 3)),
         avg_coeff("avg_coeff", dim_combine(nproma, 4, nblks_c)),
         opt_in2("opt_in2", dim_combine(nproma, nlev, nblks_e)),
-        opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c)) {
-
-    // We keep slev and elev as std::vector since they are small and used only
-    // on the host.
-    slev.resize(dim4d, 0);
-    elev.resize(dim4d, nlev - 1); // Full vertical range (0 .. nlev-1)
-  }
+        opt_out2("opt_out2", dim_combine(nproma, nlev, nblks_c)) {}
 };
 
 /// ValueTypes which the divrot tests should run with
diff --git a/test/c/test_horizontal_rot.cpp b/test/c/test_horizontal_rot.cpp
index f93cddd..ca675a4 100644
--- a/test/c/test_horizontal_rot.cpp
+++ b/test/c/test_horizontal_rot.cpp
@@ -51,19 +51,15 @@ protected:
   Kokkos::View<ValueType *, memory_space> f4dout;
 
   HorizontalRotVertexTest()
-      : vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
+      : slev(dim4d, 0),
+        elev(dim4d, nlev - 1), // Full vertical range (0 .. nlev-1)
+        vec_e("vec_e", dim_combine(nproma, nlev, nblks_e)),
         vert_edge_idx("vert_edge_idx", dim_combine(nproma, nblks_v, 6)),
         vert_edge_blk("vert_edge_blk", dim_combine(nproma, nblks_v, 6)),
         geofac_rot("geofac_rot", dim_combine(nproma, 6, nblks_v)),
         rot_vec("rot_vec", dim_combine(nproma, nlev, nblks_v)),
         f4din("f4din", dim_combine(nproma, nlev, nblks_e, dim4d)),
-        f4dout("f4dout", dim_combine(nproma, nlev, nblks_v, dim4d)) {
-
-    // We keep slev and elev as std::vector since they are small and used only
-    // on the host.
-    slev.resize(dim4d, 0);
-    elev.resize(dim4d, nlev - 1); // Full vertical range (0 .. nlev-1)
-  }
+        f4dout("f4dout", dim_combine(nproma, nlev, nblks_v, dim4d)) {}
 };
 
 /// ValueTypes which the divrot tests should run with
-- 
GitLab


From a32290547ad2f0d4cad1b19710f6a3e3acba5a50 Mon Sep 17 00:00:00 2001
From: Pradipta Samanta <samanta@dkrz.de>
Date: Tue, 15 Apr 2025 19:09:06 +0200
Subject: [PATCH 34/34] set a 10-minute Slurm time limit for CPU CI jobs

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6877a94..5109bb5 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -17,7 +17,7 @@ stages:
 variables:
   ACCOUNT_CPU: "ka1125"
   ACCOUNT_GPU: "bk1341"
-  SLURM_OPTIONS_CPU: "--account=$ACCOUNT_CPU --partition=shared"
+  SLURM_OPTIONS_CPU: "--account=$ACCOUNT_CPU --partition=shared --time=00:10:00"
   SLURM_OPTIONS_GPU: "--account=$ACCOUNT_GPU --partition=gpu --gpus=1"
   SLURM_NTASKS: "--ntasks=1"
   GIT_CONFIG_COUNT: 1
-- 
GitLab