From 1a2596ea481a212a14cc19f92fffb50c7e1e8f5a Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Wed, 26 Feb 2025 15:10:55 +0100 Subject: [PATCH 1/6] add kokkos::fence() as recommended by Sergey/Dmitry --- main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cpp b/main.cpp index 2b5bbd1..825a782 100644 --- a/main.cpp +++ b/main.cpp @@ -46,6 +46,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jb, jk, jc) = p; }}); + Kokkos::fence(); Kokkos::fence(); if(print) @@ -163,7 +164,6 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru // printf("%f ", d_view(jb, jk, jc)); }}); - Kokkos::fence(); if(print) printf("Time = %f ms\n\n", timer.seconds() * 1000); -- GitLab From 2b768d093d39c92c7d54653b8c186859da3f2c9d Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Thu, 27 Feb 2025 10:51:39 +0100 Subject: [PATCH 2/6] add scenario 5; fix wrong prints --- main.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/main.cpp b/main.cpp index 825a782..0ec21ea 100644 --- a/main.cpp +++ b/main.cpp @@ -1,4 +1,5 @@ #include <iostream> +#include <utility> #include <Kokkos_Core.hpp> #include "Kokkos_Timer.hpp" #include <cassert> @@ -30,7 +31,7 @@ inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) { void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) - std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; + std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); @@ -58,7 +59,11 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) +<<<<<<< HEAD std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; +======= + std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; +>>>>>>> cb80ea4 (add scenario 5; fix wrong prints) Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); @@ -87,7 +92,7 @@ void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=tru void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) - std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; + std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); @@ -118,7 +123,7 @@ void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=tr void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) - std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); @@ -147,7 +152,7 @@ void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=tru void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) - std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); -- GitLab From 70ec1707edc6daf67e04794b99244b8c23585de8 Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Thu, 27 Feb 2025 11:40:54 +0100 Subject: [PATCH 3/6] fix kokkos gpu compile warning --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 413f35f..c4a4910 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ endif () # if using kokkos as shared library, -fPIC is needed set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support") + # configure kokkos 4.2 repository link FetchContent_Declare(kokkos URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz) -- GitLab From 0ca092d66205555ebcf63ffc83aaf346fb8db0ab Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Thu, 27 Feb 2025 11:41:12 +0100 Subject: [PATCH 4/6] add scenario 6 --- main.cpp | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++----- script.sh | 6 ++++- 2 files changed, 79 insertions(+), 8 deletions(-) diff --git a/main.cpp b/main.cpp index 0ec21ea..7d1711c 100644 --- a/main.cpp +++ b/main.cpp @@ -59,11 +59,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=tru void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) -<<<<<<< HEAD - std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; -======= std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; ->>>>>>> cb80ea4 (add scenario 5; fix wrong prints) Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); @@ -177,13 +173,41 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru } +#if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP) + #define gpu 1 + using Layout = Kokkos::LayoutLeft; +#else + #undef gpu + using Layout = Kokkos::LayoutRight; +#endif -void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) { +template <class T> struct LoopFunctor { + Kokkos::View<T**,Kokkos::LayoutStride> view; + int nproma, nlev, nblocks, jb; + + LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***,Kokkos::LayoutStride> d_view) : + nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) { + view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb); + } + + KOKKOS_INLINE_FUNCTION + void operator() (const int jc) const { + for (int jk = 0; jk < nlev; ++jk) { + #if defined(gpu) + int p = jb * nlev * nproma + jk * nproma + jc; + #else + int p = jc * nlev * nblocks + jk * nblocks + jb; + #endif + view(jc, jk) = p; + } + } +}; +void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) { if(print) - std::cout << "Default layout; view(array, nproma, nblocks, nlev); d_view(jc, jb, jk) ----- KOMISCH" << std::endl; + std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl; - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nblocks, nlev); + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); using space_t = Kokkos::DefaultExecutionSpace::memory_space; auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); @@ -236,6 +260,49 @@ void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=tru validate(array, nblocks, nlev, nproma); } +void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) { + if(print) + std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl; + + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + + using team_policy = Kokkos::TeamPolicy<>; + using member_type = Kokkos::TeamPolicy<>::member_type; + + timer.reset(); + + Kokkos::parallel_for("blocks", team_policy(nblocks, Kokkos::AUTO), + KOKKOS_LAMBDA (const member_type &teamMember) { + const int jb = teamMember.league_rank(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), + [&] (const int jc) { + + // sequential over the levels + for (int jk = 0; jk < nlev; ++jk) { + #if defined(gpu) + int p = jb * nlev * nproma + jk * nproma + jc; + #else + int p = jc * nlev * nblocks + jk * nblocks + jb; + #endif + + d_view(jc, jk, jb) = p; + } + }); + }); + + Kokkos::fence(); + + if(print) + printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); +} + int main() { diff --git a/script.sh b/script.sh index 43ae273..d97fd6a 100755 --- a/script.sh +++ b/script.sh @@ -8,17 +8,21 @@ ulimit -s unlimited if [ "$1" == 'gpu' ] then + rm -rf build_gpu cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3" cmake --build build_gpu --parallel + ncells=(5000000) nlev=(90) nproma=(5000000) else + rm -rf build cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3" cmake --build build --parallel + ncells=(5000000) nlev=(90) - nproma=(32) # 64 96 128) + nproma=(32 64 96 128) export OMP_PROC_BIND=close export OMP_PLACES=cores -- GitLab From d9e2acfa976f1e695af0077abdb83bc5451a7aff Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Thu, 27 Feb 2025 11:54:45 +0100 Subject: [PATCH 5/6] add Dmitry's solution --- main.cpp | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/main.cpp b/main.cpp index 7d1711c..c1015f2 100644 --- a/main.cpp +++ b/main.cpp @@ -173,6 +173,38 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=tru } +void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " + << std::endl; + + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view_tmp = Kokkos::create_mirror_view_and_copy(space_t(), view); + Kokkos::View<double***, space_t> d_view("d_view", nproma, nlev, nblocks); + Kokkos::deep_copy(d_view_tmp, view); + Kokkos::deep_copy(d_view, d_view_tmp); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jc * nlev * nblocks + jk * nblocks + jb; + d_view(jc, jk, jb) = p; + + // printf("%f ", d_view(jb, jk, jc)); + } + }); + + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(d_view_tmp, d_view); + Kokkos::deep_copy(view, d_view_tmp); + validate(array, nblocks, nlev, nproma); +} + #if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP) #define gpu 1 using Layout = Kokkos::LayoutLeft; @@ -346,6 +378,7 @@ int main() { scenario_2b(array, nblocks, nlev, nproma); scenario_3(array, nblocks, nlev, nproma); scenario_4(array, nblocks, nlev, nproma); + scenario_4b(array, nblocks, nlev, nproma); scenario_5(array, nblocks, nlev, nproma); scenario_6(array, nblocks, nlev, nproma); -- GitLab From 10df2da87245e1bb292f5e3fb4a708468237236e Mon Sep 17 00:00:00 2001 From: Georgiana Mania <mania@dkrz.de> Date: Thu, 27 Feb 2025 13:52:40 +0100 Subject: [PATCH 6/6] rebase and fix conflicts --- main.cpp | 506 ++++++++++++++++++++++++++----------------------------- 1 file changed, 235 insertions(+), 271 deletions(-) diff --git a/main.cpp b/main.cpp index c1015f2..bf48c91 100644 --- a/main.cpp +++ b/main.cpp @@ -1,176 +1,180 @@ +#include <Kokkos_Core.hpp> +#include <cassert> #include <iostream> #include <utility> -#include <Kokkos_Core.hpp> + #include "Kokkos_Timer.hpp" -#include <cassert> using space_t = Kokkos::DefaultExecutionSpace::memory_space; typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy; - Kokkos::Timer timer; - +Kokkos::Timer timer; -//constexpr int nblocks = 2; -//constexpr int nlev = 90; -//constexpr int nproma = 55000; +// constexpr int nblocks = 2; +// constexpr int nlev = 90; +// constexpr int nproma = 55000; -//#define ENABLE_CHECK_BOUNDS +// #define ENABLE_CHECK_BOUNDS static void validate(double* array, int nblocks, int nlev, int nproma) { - for (int i = 0; i < nblocks * nlev * nproma; ++i) { - assert(array[i] == static_cast<double>(i)); - } + for (int i = 0; i < nblocks * nlev * nproma; ++i) { + assert(array[i] == static_cast<double>(i)); + } } inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) { #ifdef ENABLE_CHECK_BOUNDS - assert(i1 >=0 && i2 >= 0 && i3 >= 0 && - i1 < n1 && i2 < n2 && i3 < n3); + assert(i1 >= 0 && i2 >= 0 && i3 >= 0 && i1 < n1 && i2 < n2 && i3 < n3); #endif } -void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; +void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " + << std::endl; - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - timer.reset(); + timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jb, jk, jc) = p; - }}); - Kokkos::fence(); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - - validate(array, nblocks, nlev, nproma); + } + }); + Kokkos::fence(); + + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + + validate(array, nblocks, nlev, nproma); } -void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; +void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; - Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); + Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, + nblocks); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { int p = jc * nlev * nblocks + jk * nblocks + jb; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } -void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; +void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " + << std::endl; - Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); + Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, + nproma); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jb, jk, jc) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } // slow on CPU -void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) { - - if(print) - std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; - - Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); +void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + + Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, + nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } -void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) { +void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " + << std::endl; + + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - if(print) - std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { int p = jc * nlev * nblocks + jk * nblocks + jb; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); - + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) { @@ -206,171 +210,132 @@ void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = } #if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP) - #define gpu 1 - using Layout = Kokkos::LayoutLeft; +#define gpu 1 +using Layout = Kokkos::LayoutLeft; #else - #undef gpu - using Layout = Kokkos::LayoutRight; +#undef gpu +using Layout = Kokkos::LayoutRight; #endif template <class T> struct LoopFunctor { - Kokkos::View<T**,Kokkos::LayoutStride> view; - int nproma, nlev, nblocks, jb; - - LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***,Kokkos::LayoutStride> d_view) : - nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) { - view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb); - } - - KOKKOS_INLINE_FUNCTION - void operator() (const int jc) const { - for (int jk = 0; jk < nlev; ++jk) { - #if defined(gpu) - int p = jb * nlev * nproma + jk * nproma + jc; - #else - int p = jc * nlev * nblocks + jk * nblocks + jb; - #endif - view(jc, jk) = p; - } + Kokkos::View<T**, Kokkos::LayoutStride> view; + int nproma, nlev, nblocks, jb; + + LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***, Kokkos::LayoutStride> d_view) + : nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) { + view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int jc) const { + for (int jk = 0; jk < nlev; ++jk) { +#if defined(gpu) + int p = jb * nlev * nproma + jk * nproma + jc; +#else + int p = jc * nlev * nblocks + jk * nblocks + jb; +#endif + view(jc, jk) = p; } + } }; -void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl; - - Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jc * nlev * nblocks + jb * nlev + jk; - d_view(jc, jb, jk) = p; +void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl; - // printf("%f ", d_view(jb, jk, jc)); - }}); + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jc * nlev * nblocks + jb * nlev + jk; + d_view(jc, jb, jk) = p; - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } +void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl; -void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) { + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - if(print) - std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) { - //for (int jb = 0 ; jb < nblocks; ++jb) { - for (int jc = 0 ; jc < nproma; ++jc) { - for (int jk = 0; jk < nlev; ++jk) { + using team_policy = Kokkos::TeamPolicy<>; + using member_type = Kokkos::TeamPolicy<>::member_type; + + timer.reset(); + + Kokkos::parallel_for( + "blocks", team_policy(nblocks, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) { + const int jb = teamMember.league_rank(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), [&](const int jc) { + // sequential over the levels + for (int jk = 0; jk < nlev; ++jk) { +#if defined(gpu) int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); - d_view(jb, jk, jc) = p; - // printf("%f ", d_view(jb, jk, jc)); +#else + int p = jc * nlev * nblocks + jk * nblocks + jb; +#endif + + d_view(jc, jk, jb) = p; } - } + }); }); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); -} - -void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl; - - Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - - using team_policy = Kokkos::TeamPolicy<>; - using member_type = Kokkos::TeamPolicy<>::member_type; - - timer.reset(); - - Kokkos::parallel_for("blocks", team_policy(nblocks, Kokkos::AUTO), - KOKKOS_LAMBDA (const member_type &teamMember) { - const int jb = teamMember.league_rank(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), - [&] (const int jc) { - - // sequential over the levels - for (int jk = 0; jk < nlev; ++jk) { - #if defined(gpu) - int p = jb * nlev * nproma + jk * nproma + jc; - #else - int p = jc * nlev * nblocks + jk * nblocks + jb; - #endif - - d_view(jc, jk, jb) = p; - } - }); - }); + Kokkos::fence(); - Kokkos::fence(); - - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } - int main() { + int ncells = atoi(std::getenv("NCELLS")); + int nlev = atoi(std::getenv("NLEV")); + int nproma = atoi(std::getenv("NPROMA")); + int nblocks = (ncells - 1) / nproma + 1; + + std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl; + + double array[nblocks * nlev * nproma]; + /* + for (int jb = 0; jb < nblocks; ++jb) + for (int jk = 0; jk < nlev; ++jk) + for (int jc = 0; jc < nproma; ++jc) { + int p = jb * nlev * nproma + jk * nproma + jc; + array[p] = 1; //static_cast<double>(p); + } + */ + /* + for (int i = 0; i < nblocks * nlev * nproma; ++i) + std::cout << array[i] << " " ; + std::cout << "\n"; + + + for (int jb = 0; jb < nblocks; ++jb) + for (int jk = 0; jk < nlev; ++jk){ + for (int jc = 0; jc < nproma; ++jc) + std::cout << view(jb, jk, jc)<< " "; + std::cout << "\n"; + } + */ - int ncells = atoi(std::getenv("NCELLS")); - int nlev = atoi(std::getenv("NLEV")); - int nproma = atoi(std::getenv("NPROMA")); - int nblocks = (ncells - 1) / nproma + 1; - - std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl; - - double array[nblocks * nlev * nproma]; - /* - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk) - for (int jc = 0; jc < nproma; ++jc) { - int p = jb * nlev * nproma + jk * nproma + jc; - array[p] = 1; //static_cast<double>(p); - } -*/ -/* - for (int i = 0; i < nblocks * nlev * nproma; ++i) - std::cout << array[i] << " " ; - std::cout << "\n"; - - - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk){ - for (int jc = 0; jc < nproma; ++jc) - std::cout << view(jb, jk, jc)<< " "; - std::cout << "\n"; - } -*/ - - Kokkos::initialize(); -{ - + Kokkos::initialize(); + { scenario_1(array, nblocks, nlev, nproma, false); scenario_1(array, nblocks, nlev, nproma); @@ -381,26 +346,25 @@ int main() { scenario_4b(array, nblocks, nlev, nproma); scenario_5(array, nblocks, nlev, nproma); scenario_6(array, nblocks, nlev, nproma); + } + Kokkos::finalize(); -} - Kokkos::finalize(); - - return 0; + return 0; } /** - * + * * #if 0 Kokkos::parallel_for( "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}), KOKKOS_LAMBDA(const int jb, const int jk, const int jc) { int p = jb * nlev * nproma + jk * nproma + jc; d_view(jb, jk, jc) += p; - printf("%f ", d_view(jb, jk, jc)); + printf("%f ", d_view(jb, jk, jc)); }); std::cout << "\n"; #endif - for (int jb = 0 ; jb < nblocks; ++jb) + for (int jb = 0 ; jb < nblocks; ++jb) Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { for (int jk = 0; jk < nlev; ++jk) { // int p = jb * nlev * nproma + jk * nproma + jc; left @@ -408,6 +372,6 @@ int main() { // d_view(jb, jk, jc) = p; d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); + // printf("%f ", d_view(jb, jk, jc)); }}); */ -- GitLab