From 1a2596ea481a212a14cc19f92fffb50c7e1e8f5a Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Wed, 26 Feb 2025 15:10:55 +0100
Subject: [PATCH 1/6] add kokkos::fence() as recommended by Sergey/Dmitry

---
 main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.cpp b/main.cpp
index 2b5bbd1..825a782 100644
--- a/main.cpp
+++ b/main.cpp
@@ -46,6 +46,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru
             check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jb, jk, jc) = p;
         }});
+    Kokkos::fence();
     
     Kokkos::fence();
     if(print)
@@ -163,7 +164,6 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
           //  printf("%f ", d_view(jb, jk, jc));      
         }});
-
     Kokkos::fence();
     if(print)
         printf("Time = %f ms\n\n", timer.seconds() * 1000);
-- 
GitLab


From 2b768d093d39c92c7d54653b8c186859da3f2c9d Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Thu, 27 Feb 2025 10:51:39 +0100
Subject: [PATCH 2/6] add scenario 5; fix wrong prints

---
 main.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/main.cpp b/main.cpp
index 825a782..0ec21ea 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,4 +1,5 @@
 #include <iostream>
+#include <utility>
 #include <Kokkos_Core.hpp>
 #include "Kokkos_Timer.hpp"
 #include <cassert>
@@ -30,7 +31,7 @@ inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) {
 
 void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
     if(print)
-        std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+        std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
 
     Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
 
@@ -58,7 +59,11 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
 void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
     if(print)
+<<<<<<< HEAD
         std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+=======
+        std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+>>>>>>> cb80ea4 (add scenario 5; fix wrong prints)
 
     Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
  
@@ -87,7 +92,7 @@ void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
 void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
     if(print)
-        std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+        std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
 
     Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
  
@@ -118,7 +123,7 @@ void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=tr
 void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
 
     if(print)
-        std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+        std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
 
     Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
  
@@ -147,7 +152,7 @@ void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=tru
 void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
 
     if(print)
-        std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+        std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
 
     Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
  
-- 
GitLab


From 70ec1707edc6daf67e04794b99244b8c23585de8 Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Thu, 27 Feb 2025 11:40:54 +0100
Subject: [PATCH 3/6] fix kokkos gpu compile warning

---
 CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 413f35f..c4a4910 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,8 @@ endif ()
 # if using kokkos as shared library, -fPIC is needed
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
 
+set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support")
+
 # configure kokkos 4.2 repository link
 FetchContent_Declare(kokkos
         URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz)
-- 
GitLab


From 0ca092d66205555ebcf63ffc83aaf346fb8db0ab Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Thu, 27 Feb 2025 11:41:12 +0100
Subject: [PATCH 4/6] add scenario 6

---
 main.cpp  | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 script.sh |  6 ++++-
 2 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/main.cpp b/main.cpp
index 0ec21ea..7d1711c 100644
--- a/main.cpp
+++ b/main.cpp
@@ -59,11 +59,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
 void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
     if(print)
-<<<<<<< HEAD
-        std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
-=======
         std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
->>>>>>> cb80ea4 (add scenario 5; fix wrong prints)
 
     Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
  
@@ -177,13 +173,41 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
 }
 
+#if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP)
+    #define gpu 1
+    using Layout = Kokkos::LayoutLeft;
+#else
+    #undef gpu
+    using Layout = Kokkos::LayoutRight;
+#endif
 
-void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+template <class T> struct LoopFunctor {
+    Kokkos::View<T**,Kokkos::LayoutStride> view;
+    int nproma, nlev, nblocks, jb;
+
+    LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***,Kokkos::LayoutStride> d_view) :
+        nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) {
+            view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb);
+        }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int jc) const {
+        for (int jk = 0; jk < nlev; ++jk) {    
+            #if defined(gpu)
+                int p = jb * nlev * nproma + jk * nproma + jc;  
+            #else 
+                int p = jc * nlev * nblocks + jk * nblocks + jb; 
+            #endif  
+             view(jc, jk) = p;
+        }
+    }
+};
 
+void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) {
     if(print)
-        std::cout << "Default layout; view(array, nproma, nblocks, nlev); d_view(jc, jb, jk) ----- KOMISCH" << std::endl;
+        std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nblocks, nlev);
+    Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
  
     using space_t = Kokkos::DefaultExecutionSpace::memory_space;
     auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
@@ -236,6 +260,49 @@ void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=tru
     validate(array, nblocks, nlev, nproma);
 }
 
+void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+    if(print)
+        std::cout << "scenario 6: Adaptable Layout &  Hierarchical parallelism" << std::endl;
+
+    Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+ 
+    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+    
+    
+    using team_policy = Kokkos::TeamPolicy<>;
+    using member_type = Kokkos::TeamPolicy<>::member_type;
+
+    timer.reset();
+ 
+    Kokkos::parallel_for("blocks", team_policy(nblocks, Kokkos::AUTO),
+        KOKKOS_LAMBDA (const member_type &teamMember) {
+            const int jb = teamMember.league_rank();
+
+            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), 
+                [&] (const int jc) {
+
+                    // sequential over the levels
+                    for (int jk = 0; jk < nlev; ++jk) {     
+                        #if defined(gpu)
+                            int p = jb * nlev * nproma + jk * nproma + jc;  
+                        #else 
+                            int p = jc * nlev * nblocks + jk * nblocks + jb; 
+                        #endif
+
+                        d_view(jc, jk, jb) = p;
+                    }
+                });
+        });
+
+    Kokkos::fence();
+    
+    if(print)
+        printf("Time = %f ms\n\n", timer.seconds() * 1000);
+    Kokkos::deep_copy(view, d_view);
+    validate(array, nblocks, nlev, nproma);
+}
+
 
 int main() {
 
diff --git a/script.sh b/script.sh
index 43ae273..d97fd6a 100755
--- a/script.sh
+++ b/script.sh
@@ -8,17 +8,21 @@ ulimit -s unlimited
 
 if [ "$1" == 'gpu' ]
 then
+    rm -rf build_gpu
     cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build_gpu --parallel 
+
     ncells=(5000000)
     nlev=(90)
     nproma=(5000000)
 else 
+    rm -rf build
     cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build --parallel
+
     ncells=(5000000)
     nlev=(90)   
-    nproma=(32) # 64 96 128)
+    nproma=(32 64 96 128)
 
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores
-- 
GitLab


From d9e2acfa976f1e695af0077abdb83bc5451a7aff Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Thu, 27 Feb 2025 11:54:45 +0100
Subject: [PATCH 5/6] add Dmitry's solution

---
 main.cpp | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/main.cpp b/main.cpp
index 7d1711c..c1015f2 100644
--- a/main.cpp
+++ b/main.cpp
@@ -173,6 +173,38 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru
 
 }
 
+void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
+              << std::endl;
+
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);  // unmanaged host view over the caller's array
+
+  using space_t   = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view_tmp = Kokkos::create_mirror_view_and_copy(space_t(), view);  // staging mirror of view in space_t
+  Kokkos::View<double***, space_t> d_view("d_view", nproma, nlev, nblocks);  // managed view with space_t's default layout; the kernel writes here
+  Kokkos::deep_copy(d_view_tmp, view);  // NOTE(review): create_mirror_view_and_copy presumably already copied view - confirm this copy is needed
+  Kokkos::deep_copy(d_view, d_view_tmp);  // staging mirror -> d_view
+
+  timer.reset();  // the timed region starts here and ends at the fence below
+  for (int jb = 0; jb < nblocks; ++jb)  // one parallel_for launch per block
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {  // levels are walked sequentially inside each jc iteration
+            int p              = jc * nlev * nblocks + jk * nblocks + jb;  // row-major position of (jc, jk, jb) for extents (nproma, nlev, nblocks)
+            d_view(jc, jk, jb) = p;
+
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
+
+  Kokkos::fence();  // wait for the launches above before reading the timer
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(d_view_tmp, d_view);  // copy back in two steps: d_view -> staging mirror ...
+  Kokkos::deep_copy(view, d_view_tmp);  // ... -> host view, i.e. the caller's array
+  validate(array, nblocks, nlev, nproma);  // checks the result that landed in array
+}
+
 #if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP)
     #define gpu 1
     using Layout = Kokkos::LayoutLeft;
@@ -346,6 +378,7 @@ int main() {
     scenario_2b(array, nblocks, nlev, nproma);
     scenario_3(array, nblocks, nlev, nproma);
     scenario_4(array, nblocks, nlev, nproma);
+    scenario_4b(array, nblocks, nlev, nproma);
     scenario_5(array, nblocks, nlev, nproma);
     scenario_6(array, nblocks, nlev, nproma);
 
-- 
GitLab


From 10df2da87245e1bb292f5e3fb4a708468237236e Mon Sep 17 00:00:00 2001
From: Georgiana Mania <mania@dkrz.de>
Date: Thu, 27 Feb 2025 13:52:40 +0100
Subject: [PATCH 6/6] rebase and fix conflicts

---
 main.cpp | 506 ++++++++++++++++++++++++++-----------------------------
 1 file changed, 235 insertions(+), 271 deletions(-)

diff --git a/main.cpp b/main.cpp
index c1015f2..bf48c91 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,176 +1,180 @@
+#include <Kokkos_Core.hpp>
+#include <cassert>
 #include <iostream>
 #include <utility>
-#include <Kokkos_Core.hpp>
+
 #include "Kokkos_Timer.hpp"
-#include <cassert>
 
 using space_t = Kokkos::DefaultExecutionSpace::memory_space;
 typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
 
- Kokkos::Timer timer;
-   
+Kokkos::Timer timer;
 
-//constexpr int nblocks = 2;
-//constexpr int nlev = 90;
-//constexpr int nproma = 55000;
+// constexpr int nblocks = 2;
+// constexpr int nlev = 90;
+// constexpr int nproma = 55000;
 
-//#define ENABLE_CHECK_BOUNDS
+// #define ENABLE_CHECK_BOUNDS
 
 static void validate(double* array, int nblocks, int nlev, int nproma) {
-    for (int i = 0; i < nblocks * nlev * nproma; ++i)  {
-       assert(array[i] == static_cast<double>(i));
-    }
+  for (int i = 0; i < nblocks * nlev * nproma; ++i) {
+    assert(array[i] == static_cast<double>(i));
+  }
 }
 
 inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) {
 #ifdef ENABLE_CHECK_BOUNDS
-  assert(i1 >=0 && i2 >= 0 && i3 >= 0 &&
-         i1 < n1 && i2 < n2 && i3 < n3);
+  assert(i1 >= 0 && i2 >= 0 && i3 >= 0 && i1 < n1 && i2 < n2 && i3 < n3);
 #endif
 }
 
-void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
+              << std::endl;
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    timer.reset();
+  timer.reset();
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jb, jk, jc) = p;
-        }});
-    Kokkos::fence();
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-
-    validate(array, nblocks, nlev, nproma);
+          }
+        });
+  Kokkos::fence();
+
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
 
-    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev,
+                                                                                                nblocks);
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
             int p = jc * nlev * nblocks + jk * nblocks + jb;
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
+              << std::endl;
 
-    Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev,
+                                                                                                nproma);
 
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jb, jk, jc) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();    
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
 // slow on CPU
-void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-
-    if(print)
-        std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
-
-    Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jb * nlev * nproma + jk * nproma + jc;  
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev,
+                                                                                               nblocks);
+
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
+            int p = jb * nlev * nproma + jk * nproma + jc;
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
-void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print)
+    std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
+              << std::endl;
+
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
 
-    if(print)
-        std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {
             int p = jc * nlev * nblocks + jk * nblocks + jb;
-            check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
+            check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2));
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
-
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
 void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
@@ -206,171 +210,132 @@ void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print =
 }
 
 #if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP)
-    #define gpu 1
-    using Layout = Kokkos::LayoutLeft;
+#define gpu 1
+using Layout = Kokkos::LayoutLeft;
 #else
-    #undef gpu
-    using Layout = Kokkos::LayoutRight;
+#undef gpu
+using Layout = Kokkos::LayoutRight;
 #endif
 
 template <class T> struct LoopFunctor {
-    Kokkos::View<T**,Kokkos::LayoutStride> view;
-    int nproma, nlev, nblocks, jb;
-
-    LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***,Kokkos::LayoutStride> d_view) :
-        nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) {
-            view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb);
-        }
-
-    KOKKOS_INLINE_FUNCTION
-    void operator() (const int jc) const {
-        for (int jk = 0; jk < nlev; ++jk) {    
-            #if defined(gpu)
-                int p = jb * nlev * nproma + jk * nproma + jc;  
-            #else 
-                int p = jc * nlev * nblocks + jk * nblocks + jb; 
-            #endif  
-             view(jc, jk) = p;
-        }
+  Kokkos::View<T**, Kokkos::LayoutStride> view;  // 2-D slice for one block; assigned in the constructor
+  int nproma, nlev, nblocks, jb;  // extents used by the position formulas, plus the block index this functor handles
+
+  LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***, Kokkos::LayoutStride> d_view)
+      : nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) {
+    view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb);  // keep dims 0 and 1 whole, fix dim 2 at jb
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int jc) const {  // one call per jc; loops over every level
+    for (int jk = 0; jk < nlev; ++jk) {
+#if defined(gpu)  // gpu is defined earlier in this file when CUDA or HIP is enabled
+      int p = jb * nlev * nproma + jk * nproma + jc;  // position with jc varying fastest
+#else
+      int p = jc * nlev * nblocks + jk * nblocks + jb;  // position with jb varying fastest
+#endif
+      view(jc, jk) = p;  // element (jc, jk) of the slice taken at jb
     }
+  }
 };
 
-void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
-
-    Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    for (int jb = 0 ; jb < nblocks; ++jb) 
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
-            int p = jc * nlev * nblocks + jb * nlev + jk;
-            d_view(jc, jb, jk) = p;
+void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
-        }});
+  Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+
+  using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view   = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+  for (int jb = 0; jb < nblocks; ++jb)  // one parallel_for launch per block
+    Kokkos::parallel_for(
+        "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
+          for (int jk = 0; jk < nlev; ++jk) {  // view extents are (nproma, nlev, nblocks), so elements are addressed as (jc, jk, jb)
+            int p = std::is_same_v<Layout, Kokkos::LayoutLeft> ? jb * nlev * nproma + jk * nproma + jc : jc * nlev * nblocks + jk * nblocks + jb;  // linear position of (jc, jk, jb) under Layout, chosen at compile time
+            d_view(jc, jk, jb) = p;
 
-    Kokkos::fence();
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+            //  printf("%f ", d_view(jb, jk, jc));
+          }
+        });
 
+  Kokkos::fence();
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
+void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) {
+  if (print) std::cout << "scenario 6: Adaptable Layout &  Hierarchical parallelism" << std::endl;
 
-void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
 
-    if(print)
-        std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+  using space_t     = Kokkos::DefaultExecutionSpace::memory_space;
+  auto d_view       = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
-    Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    timer.reset();
-    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) {
-        //for (int jb = 0 ; jb < nblocks; ++jb) {
-      for (int jc = 0 ; jc < nproma; ++jc) {
-        for (int jk = 0; jk < nlev; ++jk) {
+  using team_policy = Kokkos::TeamPolicy<>;
+  using member_type = Kokkos::TeamPolicy<>::member_type;
+
+  timer.reset();
+
+  Kokkos::parallel_for(
+      "blocks", team_policy(nblocks, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) {
+        const int jb = teamMember.league_rank();
+
+        Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), [&](const int jc) {
+          // sequential over the levels
+          for (int jk = 0; jk < nlev; ++jk) {
+#if defined(gpu)
             int p = jb * nlev * nproma + jk * nproma + jc;
-            check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
-            d_view(jb, jk, jc) = p;
-            //  printf("%f ", d_view(jb, jk, jc));      
+#else 
+                            int p = jc * nlev * nblocks + jk * nblocks + jb;
+#endif
+
+            d_view(jc, jk, jb) = p;
           }
-        }
+        });
       });
 
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
-}
-
-void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) {
-    if(print)
-        std::cout << "scenario 6: Adaptable Layout &  Hierarchical parallelism" << std::endl;
-
-    Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
- 
-    using space_t = Kokkos::DefaultExecutionSpace::memory_space;
-    auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
-    
-    
-    using team_policy = Kokkos::TeamPolicy<>;
-    using member_type = Kokkos::TeamPolicy<>::member_type;
-
-    timer.reset();
- 
-    Kokkos::parallel_for("blocks", team_policy(nblocks, Kokkos::AUTO),
-        KOKKOS_LAMBDA (const member_type &teamMember) {
-            const int jb = teamMember.league_rank();
-
-            Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), 
-                [&] (const int jc) {
-
-                    // sequential over the levels
-                    for (int jk = 0; jk < nlev; ++jk) {     
-                        #if defined(gpu)
-                            int p = jb * nlev * nproma + jk * nproma + jc;  
-                        #else 
-                            int p = jc * nlev * nblocks + jk * nblocks + jb; 
-                        #endif
-
-                        d_view(jc, jk, jb) = p;
-                    }
-                });
-        });
+  Kokkos::fence();
 
-    Kokkos::fence();
-    
-    if(print)
-        printf("Time = %f ms\n\n", timer.seconds() * 1000);
-    Kokkos::deep_copy(view, d_view);
-    validate(array, nblocks, nlev, nproma);
+  if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
+  Kokkos::deep_copy(view, d_view);
+  validate(array, nblocks, nlev, nproma);
 }
 
-
 int main() {
+  int ncells  = atoi(std::getenv("NCELLS"));  // NOTE(review): std::getenv yields a null pointer when the variable is unset; atoi must not be given one
+  int nlev    = atoi(std::getenv("NLEV"));
+  int nproma  = atoi(std::getenv("NPROMA"));
+  int nblocks = (ncells - 1) / nproma + 1;  // ncells / nproma rounded up (for positive values); NOTE(review): divides by zero if NPROMA parses to 0
+
+  std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
+
+  double array[nblocks * nlev * nproma];  // NOTE(review): runtime-sized array (compiler extension) on the stack; script.sh sets ulimit -s unlimited before running
+  /*
+     for (int jb = 0; jb < nblocks; ++jb)
+         for (int jk = 0; jk < nlev; ++jk)
+             for (int jc = 0; jc < nproma; ++jc) {
+                 int p = jb * nlev * nproma + jk * nproma + jc;
+                 array[p] = 1; //static_cast<double>(p);
+             }
+ */
+  /*
+      for (int i = 0; i < nblocks * nlev * nproma; ++i)
+          std::cout << array[i] << " " ;
+      std::cout << "\n";
+
+
+     for (int jb = 0; jb < nblocks; ++jb)
+          for (int jk = 0; jk < nlev; ++jk){
+              for (int jc = 0; jc < nproma; ++jc)
+                  std::cout << view(jb, jk, jc)<< " ";
+            std::cout << "\n";
+          }
+  */
 
-    int ncells = atoi(std::getenv("NCELLS"));
-    int nlev = atoi(std::getenv("NLEV"));
-    int nproma = atoi(std::getenv("NPROMA"));
-    int nblocks = (ncells - 1) / nproma + 1;
-
-    std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
-
-    double array[nblocks * nlev * nproma];
- /*   
-    for (int jb = 0; jb < nblocks; ++jb)
-        for (int jk = 0; jk < nlev; ++jk)
-            for (int jc = 0; jc < nproma; ++jc) {
-                int p = jb * nlev * nproma + jk * nproma + jc;
-                array[p] = 1; //static_cast<double>(p);
-            }
-*/
-/*
-    for (int i = 0; i < nblocks * nlev * nproma; ++i) 
-        std::cout << array[i] << " " ;
-    std::cout << "\n";     
-
-
-   for (int jb = 0; jb < nblocks; ++jb)
-        for (int jk = 0; jk < nlev; ++jk){
-            for (int jc = 0; jc < nproma; ++jc) 
-                std::cout << view(jb, jk, jc)<< " ";
-          std::cout << "\n";
-        }
-*/
-
-    Kokkos::initialize();
-{
-
+  Kokkos::initialize();
+  {
     scenario_1(array, nblocks, nlev, nproma, false);
 
     scenario_1(array, nblocks, nlev, nproma);
@@ -381,26 +346,25 @@ int main() {
     scenario_4b(array, nblocks, nlev, nproma);
     scenario_5(array, nblocks, nlev, nproma);
     scenario_6(array, nblocks, nlev, nproma);
+  }
+  Kokkos::finalize();
 
-}
-    Kokkos::finalize();
-
-    return 0;
+  return 0;
 }
 
 /**
- * 
+ *
  * #if 0
     Kokkos::parallel_for(
       "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}),
       KOKKOS_LAMBDA(const int jb, const int jk, const int jc) {
             int p = jb * nlev * nproma + jk * nproma + jc;
             d_view(jb, jk, jc) += p;
-            printf("%f ", d_view(jb, jk, jc));        
+            printf("%f ", d_view(jb, jk, jc));
     });
     std::cout << "\n";
 #endif
-    for (int jb = 0 ; jb < nblocks; ++jb) 
+    for (int jb = 0 ; jb < nblocks; ++jb)
     Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
         for (int jk = 0; jk < nlev; ++jk) {
           //  int p = jb * nlev * nproma + jk * nproma + jc;  left
@@ -408,6 +372,6 @@ int main() {
            // d_view(jb, jk, jc) = p;
             d_view(jc, jk, jb) = p;
 
-          //  printf("%f ", d_view(jb, jk, jc));      
+          //  printf("%f ", d_view(jb, jk, jc));
         }});
 */
-- 
GitLab