diff --git a/CMakeLists.txt b/CMakeLists.txt
index 413f35f3c197d1aeb0e4309de3e144656daf05cd..c4a491034ef4877dbb325ab219fc83f153d3e242 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,8 @@ endif ()
 # if using kokkos as shared library, -fPIC is needed
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
 
+set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support")
+
 # configure kokkos 4.2 repository link
 FetchContent_Declare(kokkos
         URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz)
diff --git a/main.cpp b/main.cpp
index 2b5bbd12e9afdf7c33a891d9a06acbace836e4b5..bf48c9114770f370be23bb2286237d46404db8ee 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,272 +1,341 @@
-#include <iostream>
 #include <Kokkos_Core.hpp>
-#include "Kokkos_Timer.hpp"
 #include <cassert>
+#include <iostream>
+#include <utility>
+
+#include "Kokkos_Timer.hpp"
 
 using space_t = Kokkos::DefaultExecutionSpace::memory_space;
 typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
 
- Kokkos::Timer timer;
-   
+Kokkos::Timer timer;
 
-//constexpr int nblocks = 2;
-//constexpr int nlev = 90;
-//constexpr int nproma = 55000;
+// constexpr int nblocks = 2;
+// constexpr int nlev = 90;
+// constexpr int nproma = 55000;
 
-//#define ENABLE_CHECK_BOUNDS
+// #define ENABLE_CHECK_BOUNDS
 
 static void validate(double* array, int nblocks, int nlev, int nproma) {
-    for (int i = 0; i < nblocks * nlev * nproma; ++i)  {
-       assert(array[i] == static_cast<double>(i));
-    }
+  for (int i = 0; i < nblocks * nlev * nproma; ++i) {
+    assert(array[i] == static_cast<double>(i));
+  }
 }
 
 inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) {
 #ifdef ENABLE_CHECK_BOUNDS
-  assert(i1 >=0 && i2 >= 0 && i3 >= 0 &&
-         i1 < n1 && i2 < n2 && i3 < n3);
+  assert(i1 >= 0 && i2 >= 0 && i3 >= 0 && i1 < n1 && i2 < n2 && i3 < n3);
 #endif
 }
 
-void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
+              << std::endl;
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    timer.reset();
+  timer.reset();
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jb, jk, jc) = p;
-        }});
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
+          }
+        });
+  // A single fence is enough: it blocks until every per-block launch above has
+  // completed, so the elapsed time printed below covers the whole loop.
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
 
-    validate(array, nblocks, nlev, nproma);
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
 
-    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev,
+                                                                                                nblocks);
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
             int p = jc * nlev * nblocks + jk * nblocks + jb;
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
+              << std::endl;
 
-    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev,
+                                                                                                nproma);
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jb, jk, jc) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();    
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
 // slow on CPU
-void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-
-    if(print)
-        std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
-
-    Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev,
+                                                                                               nblocks);
+
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
+              << std::endl;
 
-    if(print)
-        std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
             int p = jc * nlev * nblocks + jk * nblocks + jb;
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
+}
+
+void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
+              << std::endl;
+
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+
+  using space_t   = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view_tmp = Kokkos::create_mirror_view_and_copy(space_t(), view);
+  Kokkos::View<double***, space_t> d_view("d_view", nproma, nlev, nblocks);
+  Kokkos::deep_copy(d_view_tmp, view);
+  Kokkos::deep_copy(d_view, d_view_tmp);
+
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p              = jc * nlev * nblocks + jk * nblocks + jb;
+            d_view(jc, jk, jb) = p;
 
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(d_view_tmp, d_view);
+  Kokkos::deep_copy(view, d_view_tmp);
+  validate(array, nblocks, nlev, nproma);
 }
 
+#if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP)
+#define gpu 1
+using Layout = Kokkos::LayoutLeft;
+#else
+#undef gpu
+using Layout = Kokkos::LayoutRight;
+#endif
 
-void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+template <class T> struct LoopFunctor {  // kernel body: fills one block's (jc, jk) slice with linear indices
+  Kokkos::View<T**, Kokkos::LayoutStride> view;  // 2-D slice of the 3-D view at a fixed last index jb
+  int nproma, nlev, nblocks, jb;
+
+  LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***, Kokkos::LayoutStride> d_view)
+      : nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) {
+    view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb);  // keep dims 0 and 1, pin dim 2 to jb
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int jc) const {  // one call per jc; loops sequentially over all nlev levels
+    for (int jk = 0; jk < nlev; ++jk) {
+#if defined(gpu)  // gpu is defined above when CUDA or HIP is enabled (Layout = LayoutLeft)
+      int p = jb * nlev * nproma + jk * nproma + jc;  // linear index with jc varying fastest
+#else
+      int p = jc * nlev * nblocks + jk * nblocks + jb;  // linear index with jb varying fastest
+#endif
+      view(jc, jk) = p;
+    }
+  }
+};
 
-    if(print)
-        std::cout << "Default layout; view(array, nproma, nblocks, nlev); d_view(jc, jb, jk) ----- KOMISCH" << std::endl;
+void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nblocks, nlev);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jc * nlev * nblocks + jb * nlev + jk;
-            d_view(jc, jb, jk) = p;
+  Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+  timer.reset();
+  // One launch per block: LoopFunctor takes the (jc, jk) subview of block jb and
+  // stores each element's linear offset for the active Layout, which keeps every
+  // access inside the (nproma, nlev, nblocks) extents of d_view.
+  const Kokkos::RangePolicy<> cells(0, nproma);
+  using BlockFill = LoopFunctor<double>;
 
+  for (int jb = 0; jb < nblocks; ++jb) {
+    const BlockFill fill(nproma, nlev, nblocks, jb, d_view);
+    Kokkos::parallel_for("", cells, fill);
+  }
+
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
+void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print) std::cout << "scenario 6: Adaptable Layout &  Hierarchical parallelism" << std::endl;
 
-void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
 
-    if(print)
-        std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+  using space_t     = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view       = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) {
-        //for (int jb = 0 ; jb < nblocks; ++jb) {
-      for (int jc = 0 ; jc < nproma; ++jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  using team_policy = Kokkos::TeamPolicy<>;
+  using member_type = Kokkos::TeamPolicy<>::member_type;
+
+  timer.reset();
+
+  Kokkos::parallel_for(
+      "blocks", team_policy(nblocks, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) {
+        const int jb = teamMember.league_rank();
+
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), [&](const int jc) {
+          // sequential over the levels
+          for (int jk = 0; jk < nlev; ++jk) {
+#if defined(gpu)
             int p = jb * nlev * nproma + jk * nproma + jc;
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
-            d_view(jb, jk, jc) = p;
-            //  printf("%f ", d_view(jb, jk, jc));      
+#else 
+                            int p = jc * nlev * nblocks + jk * nblocks + jb;
+#endif
+
+            d_view(jc, jk, jb) = p;
           }
-        }
+        });
       });
 
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
-}
+  Kokkos::fence();
 
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
+}
 
 int main() {
+  // Sizes come from the environment; atoi(nullptr) is undefined, so unset reads as 0 and is rejected.
+  auto env_int = [](const char* name) { const char* v = std::getenv(name); return v ? atoi(v) : 0; };
+  int ncells = env_int("NCELLS"), nlev = env_int("NLEV"), nproma = env_int("NPROMA");
+  if (ncells < 1 || nlev < 1 || nproma < 1) { std::cerr << "NCELLS, NLEV, NPROMA must be positive\n"; return 1; }
+  int nblocks = (ncells - 1) / nproma + 1;  // ceil(ncells / nproma); nproma > 0 is guaranteed above
+  std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
+
+  double array[nblocks * nlev * nproma];
+  /*
+     for (int jb = 0; jb < nblocks; ++jb)
+         for (int jk = 0; jk < nlev; ++jk)
+             for (int jc = 0; jc < nproma; ++jc) {
+                 int p = jb * nlev * nproma + jk * nproma + jc;
+                 array[p] = 1; //static_cast<double>(p);
+             }
+ */
+  /*
+      for (int i = 0; i < nblocks * nlev * nproma; ++i)
+          std::cout << array[i] << " " ;
+      std::cout << "\n";
+
+
+     for (int jb = 0; jb < nblocks; ++jb)
+          for (int jk = 0; jk < nlev; ++jk){
+              for (int jc = 0; jc < nproma; ++jc)
+                  std::cout << view(jb, jk, jc)<< " ";
+            std::cout << "\n";
+          }
+  */
 
-    int ncells = atoi(std::getenv("NCELLS"));
-    int nlev = atoi(std::getenv("NLEV"));
-    int nproma = atoi(std::getenv("NPROMA"));
-    int nblocks = (ncells - 1) / nproma + 1;
-
-    std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
-
-    double array[nblocks * nlev * nproma];
- /*   
-    for (int jb = 0; jb < nblocks; ++jb)
-        for (int jk = 0; jk < nlev; ++jk)
-            for (int jc = 0; jc < nproma; ++jc) {
-                int p = jb * nlev * nproma + jk * nproma + jc;
-                array[p] = 1; //static_cast<double>(p);
-            }
-*/
-/*
-    for (int i = 0; i < nblocks * nlev * nproma; ++i) 
-        std::cout << array[i] << " " ;
-    std::cout << "\n";     
-
-
-   for (int jb = 0; jb < nblocks; ++jb)
-        for (int jk = 0; jk < nlev; ++jk){
-            for (int jc = 0; jc < nproma; ++jc) 
-                std::cout << view(jb, jk, jc)<< " ";
-          std::cout << "\n";
-        }
-*/
-
-    Kokkos::initialize();
-{
-
+  Kokkos::initialize();
+  {
     scenario_1(array, nblocks, nlev, nproma, false);
 
     scenario_1(array, nblocks, nlev, nproma);
@@ -274,28 +343,28 @@ int main() {
     scenario_2b(array, nblocks, nlev, nproma);
     scenario_3(array, nblocks, nlev, nproma);
     scenario_4(array, nblocks, nlev, nproma);
+    scenario_4b(array, nblocks, nlev, nproma);
     scenario_5(array, nblocks, nlev, nproma);
     scenario_6(array, nblocks, nlev, nproma);
+  }
+  Kokkos::finalize();
 
-}
-    Kokkos::finalize();
-
-    return 0;
+  return 0;
 }
 
 /**
- * 
+ *
  * #if 0
     Kokkos::parallel_for(
       "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}),
       KOKKOS_LAMBDA(const int jb, const int jk, const int jc) {
             int p = jb * nlev * nproma + jk * nproma + jc;
             d_view(jb, jk, jc) += p;
-            printf("%f ", d_view(jb, jk, jc));        
+            printf("%f ", d_view(jb, jk, jc));
     });
     std::cout << "\n";
 #endif
-    for (int jb = 0 ; jb < nblocks; ++jb) 
+    for (int jb = 0 ; jb < nblocks; ++jb)
     Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
         for (int jk = 0; jk < nlev; ++jk) {
           //  int p = jb * nlev * nproma + jk * nproma + jc;  left
@@ -303,6 +372,6 @@ int main() {
            // d_view(jb, jk, jc) = p;
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
+          //  printf("%f ", d_view(jb, jk, jc));
         }});
 */
diff --git a/script.sh b/script.sh
index 43ae273b91817ca78789ca75d158fd92a368a908..d97fd6a3a2b24e78ce29b017dc6c6b3b809a898e 100755
--- a/script.sh
+++ b/script.sh
@@ -8,17 +8,21 @@ ulimit -s unlimited
 
 if [ "$1" == 'gpu' ]
 then
+    rm -rf build_gpu
     cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build_gpu --parallel 
+
     ncells=(5000000)
     nlev=(90)
     nproma=(5000000)
 else 
+    rm -rf build
     cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build --parallel
+
     ncells=(5000000)
     nlev=(90)   
-    nproma=(32) # 64 96 128)
+    nproma=(32 64 96 128)
 
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores