diff --git a/CMakeLists.txt b/CMakeLists.txt index 0903a9e003259cbd69cf50382621e00adc43d0d9..8dca0f4882ffed6d6e03f1a8affd070865fba39a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ target_link_libraries(demo PUBLIC Kokkos::kokkos) if ("${MU_ARCH}" STREQUAL "a100") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -Minfo=accel -gpu=cc80") + add_compile_definitions(DEMO_DEVICE) target_compile_options(demo PUBLIC "-gpu=pinned") target_link_options(demo PUBLIC "-gpu=pinned") endif() diff --git a/main.cpp b/main.cpp index d0b53ac5792a280bd6adda6fc04cea992f795781..f869a4b3b236f89bc346e8cce7338907077f3398 100644 --- a/main.cpp +++ b/main.cpp @@ -59,9 +59,8 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t validate(array, nblocks, nlev, nproma); } -void scenario_1b(double* array, int nblocks, int nlev, int nproma, bool print = true) { - if (print) - std::cout << "scenario 1b: Default layout right; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " +void scenario_1b(double* array, int nblocks, int nlev, int nproma) { + std::cout << "scenario 1b: Default layout right; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma); @@ -79,7 +78,7 @@ void scenario_1b(double* array, int nblocks, int nlev, int nproma, bool print = }); Kokkos::fence(); - if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000); + printf("Time = %f ms\n\n", timer.seconds() * 1000); } @@ -235,12 +234,10 @@ void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = validate(array, nblocks, nlev, nproma); } -#if defined(KOKKOS_ENABLE_CUDA) or defined(KOKKOS_ENABLE_HIP) -#define gpu 1 -using Layout = Kokkos::LayoutLeft; +#if defined(DEMO_DEVICE) + using Layout = Kokkos::LayoutLeft; #else -#undef gpu -using Layout = Kokkos::LayoutRight; + using Layout = Kokkos::LayoutRight; #endif template <class T> struct LoopFunctor { @@ -255,7 +252,7 @@ template <class T> struct LoopFunctor { KOKKOS_INLINE_FUNCTION void operator()(const int jc) const { for (int jk = 0; jk < nlev; ++jk) { -#if defined(gpu) +#if defined(DEMO_DEVICE) int p = jb * nlev * nproma + jk * nproma + jc; #else int p = jc * nlev * nblocks + jk * nblocks + jb; @@ -312,7 +309,7 @@ void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = t Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, nproma_tmp), [&](const int jc) { // sequential over the levels for (int jk = 0; jk < nlev; ++jk) { -#if defined(gpu) +#if defined(DEMO_DEVICE) int p = jb * nlev * nproma + jk * nproma + jc; #else int p = jc * nlev * nblocks + jk * nblocks + jb; @@ -361,10 +358,9 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru validate(array, nblocks, nlev, nproma); } -void scenario_7b(double* array, int nblocks, int nlev, int nproma, bool print=true) { +void scenario_7b(double* array, int nblocks, int nlev, int nproma) { - if(print) - std::cout << "scenario 7b: +ACC Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; + std::cout << "scenario 7b: +ACC Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl; Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma); @@ -374,7 +370,7 @@ void scenario_7b(double* array, int nblocks, int nlev, int nproma, bool print=tr for (int jc = 0 ; jc < nproma; ++jc) { for (int jk = 0; jk < nlev; ++jk) { -#if defined(gpu) +#if defined(DEMO_DEVICE) int p = jc * nlev * nblocks + jk * nblocks + jb; #else int p = jb * nlev * nproma + jk * nproma + jc; @@ -388,14 +384,11 @@ void scenario_7b(double* array, int nblocks, int nlev, int nproma, bool print=tr }); Kokkos::fence(); - - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); + printf("Time = %f ms\n\n", timer.seconds() * 1000); } -void scenario_8(double* array, int nblocks, int nlev, int nproma, bool print=true) { - if(print) - std::cout << "scenario 8: Default Layout as ICON pointer view(array, nproma, nlev, nblocks); d_view(jb, jk, jc) ----- " << std::endl; +void scenario_8(double* array, int nblocks, int nlev, int nproma) { + std::cout << "scenario 8: Default Layout as ICON pointer view(array, nproma, nlev, nblocks); d_view(jb, jk, jc) ----- " << std::endl; using space_t = Kokkos::DefaultExecutionSpace::memory_space; Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks); @@ -408,7 +401,7 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma, bool print=tru for (int jk = 0; jk < nlev; ++jk) { -#if defined(gpu) +#if defined(DEMO_DEVICE) int p = jb * nlev * nproma + jk * nproma + jc; #else int p = jc * nlev * nblocks + jk * nblocks + jb; @@ -421,11 +414,28 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma, bool print=tru Kokkos::fence(); - if(print) - printf("Time = %f ms\n\n", timer.seconds() * 1000); + printf("Time = %f ms\n\n", timer.seconds() * 1000); } +void openacc_calls(double* array, int nblocks, int nlev, int nproma, + std::function<void(double*, int, int, int)> func){ + #if defined(DEMO_DEVICE) + #pragma acc enter data copyin(array[0:nblocks*nlev*nproma]) + { + #pragma acc host_data use_device(array) + { + func(array, nblocks, nlev, nproma); + } + } + #pragma acc exit data copyout(array[0:nblocks*nlev*nproma]) + #else + func(array, nblocks, nlev, nproma); + #endif + validate(array, nblocks, nlev, nproma); + +} + int main() { int ncells = atoi(std::getenv("NCELLS")); int nlev = atoi(std::getenv("NLEV")); @@ -435,48 +445,23 @@ int main() { std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl; double array[nblocks * nlev * nproma]; - /* - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk) - for (int jc = 0; jc < nproma; ++jc) { - int p = jb * nlev * nproma + jk * nproma + jc; - array[p] = 1; //static_cast<double>(p); - } - */ - /* - for (int i = 0; i < nblocks * nlev * nproma; ++i) - std::cout << array[i] << " " ; - std::cout << "\n"; - - - for (int jb = 0; jb < nblocks; ++jb) - for (int jk = 0; jk < nlev; ++jk){ - for (int jc = 0; jc < nproma; ++jc) - std::cout << view(jb, jk, jc)<< " "; - std::cout << "\n"; - } - */ - Kokkos::initialize(); { scenario_1(array, nblocks, nlev, nproma, false); - scenario_1(array, nblocks, nlev, nproma); + //scenario_1(array, nblocks, nlev, nproma); - #if defined(gpu) - #pragma acc enter data copyin(array[0:nblocks*nlev*nproma]) - { - #pragma acc host_data use_device(array) - { - scenario_1b(array, nblocks, nlev, nproma); - } - } - #pragma acc exit data copyout(array[0:nblocks*nlev*nproma]) - #else - scenario_1b(array, nblocks, nlev, nproma); - #endif - validate(array, nblocks, nlev, nproma); - + memset(array, 0.0, sizeof(array)); + std::function<void(double*, int, int, int)> s_1b = scenario_1b; + openacc_calls(array, nblocks, nlev, nproma, s_1b); + + memset(array, 0.0, sizeof(array)); + std::function<void(double*, int, int, int)> s_7b = scenario_7b; + openacc_calls(array, nblocks, nlev, nproma, s_7b); + + memset(array, 0.0, sizeof(array)); + std::function<void(double*, int, int, int)> s_8 = scenario_8; + openacc_calls(array, nblocks, nlev, nproma, s_8); /* scenario_2(array, nblocks, nlev, nproma); scenario_2b(array, nblocks, nlev, nproma); @@ -488,60 +473,8 @@ int main() { scenario_7(array, nblocks, nlev, nproma); */ - #if defined(gpu) - #pragma acc enter data copyin(array[0:nblocks*nlev*nproma]) - { - #pragma acc host_data use_device(array) - { - scenario_7b(array, nblocks, nlev, nproma); - } - } - #pragma acc exit data copyout(array[0:nblocks*nlev*nproma]) - #else - scenario_7b(array, nblocks, nlev, nproma); - #endif - validate(array, nblocks, nlev, nproma); - - #if defined(gpu) - #pragma acc enter data copyin(array[0:nblocks*nlev*nproma]) - { - #pragma acc host_data use_device(array) - { - scenario_8(array, nblocks, nlev, nproma); - } - } - #pragma acc exit data copyout(array[0:nblocks*nlev*nproma]) - #else - scenario_8(array, nblocks, nlev, nproma); - #endif - validate(array, nblocks, nlev, nproma); - } Kokkos::finalize(); return 0; -} - -/** - * - * #if 0 - Kokkos::parallel_for( - "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}), - KOKKOS_LAMBDA(const int jb, const int jk, const int jc) { - int p = jb * nlev * nproma + jk * nproma + jc; - d_view(jb, jk, jc) += p; - printf("%f ", d_view(jb, jk, jc)); - }); - std::cout << "\n"; -#endif - for (int jb = 0 ; jb < nblocks; ++jb) - Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) { - for (int jk = 0; jk < nlev; ++jk) { - // int p = jb * nlev * nproma + jk * nproma + jc; left - int p = jc * nlev * nblocks + jk * nblocks + jb; - // d_view(jb, jk, jc) = p; - d_view(jc, jk, jb) = p; - - // printf("%f ", d_view(jb, jk, jc)); - }}); -*/ +} \ No newline at end of file