diff --git a/CMakeLists.txt b/CMakeLists.txt index 413f35f3c197d1aeb0e4309de3e144656daf05cd..c4a491034ef4877dbb325ab219fc83f153d3e242 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,8 @@ endif () # if using kokkos as shared library, -fPIC is needed set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +set(Kokkos_ENABLE_IMPL_MDSPAN OFF CACHE BOOL "Experimental mdspan support") + # configure kokkos 4.2 repository link FetchContent_Declare(kokkos URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz) diff --git a/main.cpp b/main.cpp index 2b5bbd12e9afdf7c33a891d9a06acbace836e4b5..bf48c9114770f370be23bb2286237d46404db8ee 100644 --- a/main.cpp +++ b/main.cpp @@ -1,272 +1,341 @@ -#include <iostream> #include <Kokkos_Core.hpp> -#include "Kokkos_Timer.hpp" #include <cassert> +#include <iostream> +#include <utility> + +#include "Kokkos_Timer.hpp" using space_t = Kokkos::DefaultExecutionSpace::memory_space; typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy; - Kokkos::Timer timer; - +Kokkos::Timer timer; -//constexpr int nblocks = 2; -//constexpr int nlev = 90; -//constexpr int nproma = 55000; +// constexpr int nblocks = 2; +// constexpr int nlev = 90; +// constexpr int nproma = 55000; -//#define ENABLE_CHECK_BOUNDS +// #define ENABLE_CHECK_BOUNDS static void validate(double* array, int nblocks, int nlev, int nproma) { - for (int i = 0; i < nblocks * nlev * nproma; ++i) { - assert(array[i] == static_cast<double>(i)); - } + for (int i = 0; i < nblocks * nlev * nproma; ++i) { + assert(array[i] == static_cast<double>(i)); + } } inline void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) { #ifdef ENABLE_CHECK_BOUNDS - assert(i1 >=0 && i2 >= 0 && i3 >= 0 && - i1 < n1 && i2 < n2 && i3 < n3); + assert(i1 >= 0 && i2 >= 0 && i3 >= 0 && i1 < n1 && i2 < n2 && i3 < n3); #endif } -void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; +void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " + << std::endl; - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - timer.reset(); + timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jb, jk, jc) = p; - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); + } + }); + Kokkos::fence(); + + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + validate(array, nblocks, nlev, nproma); } -void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; +void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 2: Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; - Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); + Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, + nblocks); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { int p = jc * nlev * nblocks + jk * nblocks + jb; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } -void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; +void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " + << std::endl; - Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); + Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, + nproma); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jb, jk, jc) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } // slow on CPU -void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) { - - if(print) - std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; - - Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); +void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 3: Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + + Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, + nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jb * nlev * nproma + jk * nproma + jc; + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); - - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } -void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) { +void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " + << std::endl; - if(print) - std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl; + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { int p = jc * nlev * nblocks + jk * nblocks + jb; - check_bounds(jc,jk,jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); + check_bounds(jc, jk, jb, d_view.extent(0), d_view.extent(1), d_view.extent(2)); d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); - }}); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); +} + +void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) + std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " + << std::endl; + + Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); + + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view_tmp = Kokkos::create_mirror_view_and_copy(space_t(), view); + Kokkos::View<double***, space_t> d_view("d_view", nproma, nlev, nblocks); + Kokkos::deep_copy(d_view_tmp, view); + Kokkos::deep_copy(d_view, d_view_tmp); + + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jc * nlev * nblocks + jk * nblocks + jb; + d_view(jc, jk, jb) = p; - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + // printf("%f ", d_view(jb, jk, jc)); + } + }); + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(d_view_tmp, d_view); + Kokkos::deep_copy(view, d_view_tmp); + validate(array, nblocks, nlev, nproma); } +#if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP) +#define gpu 1 +using Layout = Kokkos::LayoutLeft; +#else +#undef gpu +using Layout = Kokkos::LayoutRight; +#endif -void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print=true) { +template <class T> struct LoopFunctor { + Kokkos::View<T**, Kokkos::LayoutStride> view; + int nproma, nlev, nblocks, jb; + + LoopFunctor(int nproma, int nlev, int nblocks, int jb, Kokkos::View<T***, Kokkos::LayoutStride> d_view) + : nproma(nproma), nlev(nlev), nblocks(nblocks), jb(jb) { + view = Kokkos::subview(d_view, Kokkos::ALL, Kokkos::ALL, jb); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int jc) const { + for (int jk = 0; jk < nlev; ++jk) { +#if defined(gpu) + int p = jb * nlev * nproma + jk * nproma + jc; +#else + int p = jc * nlev * nblocks + jk * nblocks + jb; +#endif + view(jc, jk) = p; + } + } +}; - if(print) - std::cout << "Default layout; view(array, nproma, nblocks, nlev); d_view(jc, jb, jk) ----- KOMISCH" << std::endl; +void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl; - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nblocks, nlev); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - int p = jc * nlev * nblocks + jb * nlev + jk; - d_view(jc, jb, jk) = p; + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - // printf("%f ", d_view(jb, jk, jc)); - }}); + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); + timer.reset(); + for (int jb = 0; jb < nblocks; ++jb) + Kokkos::parallel_for( + "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) { + for (int jk = 0; jk < nlev; ++jk) { + int p = jc * nlev * nblocks + jb * nlev + jk; + d_view(jc, jb, jk) = p; + // printf("%f ", d_view(jb, jk, jc)); + } + }); + + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); } +void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) { + if (print) std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl; -void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print=true) { + Kokkos::View<double***, Layout, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks); - if(print) - std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; + using space_t = Kokkos::DefaultExecutionSpace::memory_space; + auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma); - - using space_t = Kokkos::DefaultExecutionSpace::memory_space; - auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view); - - timer.reset(); - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) { - //for (int jb = 0 ; jb < nblocks; ++jb) { - for (int jc = 0 ; jc < nproma; ++jc) { - for (int jk = 0; jk < nlev; ++jk) { + using team_policy = Kokkos::TeamPolicy<>; + using member_type = Kokkos::TeamPolicy<>::member_type; + + timer.reset(); + + Kokkos::parallel_for( + "blocks", team_policy(nblocks, Kokkos::AUTO), KOKKOS_LAMBDA(const member_type& teamMember) { + const int jb = teamMember.league_rank(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma), [&](const int jc) { + // sequential over the levels + for (int jk = 0; jk < nlev; ++jk) { +#if defined(gpu) int p = jb * nlev * nproma + jk * nproma + jc; - check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2)); - d_view(jb, jk, jc) = p; - // printf("%f ", d_view(jb, jk, jc)); +#else + int p = jc * nlev * nblocks + jk * nblocks + jb; +#endif + + d_view(jc, jk, jb) = p; } - } + }); }); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); - Kokkos::deep_copy(view, d_view); - validate(array, nblocks, nlev, nproma); -} + Kokkos::fence(); + if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + Kokkos::deep_copy(view, d_view); + validate(array, nblocks, nlev, nproma); +} int main() { + int ncells = atoi(std::getenv("NCELLS")); + int nlev = atoi(std::getenv("NLEV")); + int nproma = atoi(std::getenv("NPROMA")); + int nblocks = (ncells - 1) / nproma + 1; + + std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl; + + double array[nblocks * nlev * nproma]; + /* + for (int jb = 0; jb < nblocks; ++jb) + for (int jk = 0; jk < nlev; ++jk) + for (int jc = 0; jc < nproma; ++jc) { + int p = jb * nlev * nproma + jk * nproma + jc; + array[p] = 1; //static_cast<double>(p); + } + */ + /* + for (int i = 0; i < nblocks * nlev * nproma; ++i) + std::cout << array[i] << " " ; + std::cout << "\n"; + + + for (int jb = 0; jb < nblocks; ++jb) + for (int jk = 0; jk < nlev; ++jk){ + for (int jc = 0; jc < nproma; ++jc) + std::cout << view(jb, jk, jc)<< " "; + std::cout << "\n"; + } + */ - int ncells = atoi(std::getenv("NCELLS")); - int nlev = atoi(std::getenv("NLEV")); - int nproma = atoi(std::getenv("NPROMA")); - int nblocks = (ncells - 1) / nproma + 1; - - std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl; - - double array[nblocks * nlev * nproma]; - /* - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk) - for (int jc = 0; jc < nproma; ++jc) { - int p = jb * nlev * nproma + jk * nproma + jc; - array[p] = 1; //static_cast<double>(p); - } -*/ -/* - for (int i = 0; i < nblocks * nlev * nproma; ++i) - std::cout << array[i] << " " ; - std::cout << "\n"; - - - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk){ - for (int jc = 0; jc < nproma; ++jc) - std::cout << view(jb, jk, jc)<< " "; - std::cout << "\n"; - } -*/ - - Kokkos::initialize(); -{ - + Kokkos::initialize(); + { scenario_1(array, nblocks, nlev, nproma, false); scenario_1(array, nblocks, nlev, nproma); @@ -274,28 +343,28 @@ int main() { scenario_2b(array, nblocks, nlev, nproma); scenario_3(array, nblocks, nlev, nproma); scenario_4(array, nblocks, nlev, nproma); + scenario_4b(array, nblocks, nlev, nproma); scenario_5(array, nblocks, nlev, nproma); scenario_6(array, nblocks, nlev, nproma); + } + Kokkos::finalize(); -} - Kokkos::finalize(); - - return 0; + return 0; } /** - * + * * #if 0 Kokkos::parallel_for( "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}), KOKKOS_LAMBDA(const int jb, const int jk, const int jc) { int p = jb * nlev * nproma + jk * nproma + jc; d_view(jb, jk, jc) += p; - printf("%f ", d_view(jb, jk, jc)); + printf("%f ", d_view(jb, jk, jc)); }); std::cout << "\n"; #endif - for (int jb = 0 ; jb < nblocks; ++jb) + for (int jb = 0 ; jb < nblocks; ++jb) Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { for (int jk = 0; jk < nlev; ++jk) { // int p = jb * nlev * nproma + jk * nproma + jc; left @@ -303,6 +372,6 @@ int main() { // d_view(jb, jk, jc) = p; d_view(jc, jk, jb) = p; - // printf("%f ", d_view(jb, jk, jc)); + // printf("%f ", d_view(jb, jk, jc)); }}); */ diff --git a/script.sh b/script.sh index 43ae273b91817ca78789ca75d158fd92a368a908..d97fd6a3a2b24e78ce29b017dc6c6b3b809a898e 100755 --- a/script.sh +++ b/script.sh @@ -8,17 +8,21 @@ ulimit -s unlimited if [ "$1" == 'gpu' ] then + rm -rf build_gpu cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3" cmake --build build_gpu --parallel + ncells=(5000000) nlev=(90) nproma=(5000000) else + rm -rf build cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3" cmake --build build --parallel + ncells=(5000000) nlev=(90) - nproma=(32) # 64 96 128) + nproma=(32 64 96 128) export OMP_PROC_BIND=close export OMP_PLACES=cores