diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100755
index 0000000000000000000000000000000000000000..413f35f3c197d1aeb0e4309de3e144656daf05cd
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,43 @@
+cmake_minimum_required(VERSION 3.20)
+project(demo LANGUAGES CXX VERSION 0.0.1)
+
+include(FetchContent)
+
+# Opt in to policy CMP0135 where it exists; leaving it unset makes
+# FetchContent emit a warning for URL downloads.
+if (POLICY CMP0135)
+  cmake_policy(SET CMP0135 NEW)
+endif ()
+
+# if using kokkos as shared library, -fPIC is needed
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
+
+# Kokkos 4.4.01 release tarball
+FetchContent_Declare(kokkos
+  URL https://github.com/kokkos/kokkos/releases/download/4.4.01/kokkos-4.4.01.tar.gz)
+
+# MU_ARCH selects the Kokkos backend and target architecture.
+if (("${MU_ARCH}" STREQUAL "x86_64") OR ("${MU_ARCH}" STREQUAL "arm"))
+  # CPU build: use OpenMP when available, otherwise the serial backend.
+  find_package(OpenMP)
+  if (OpenMP_FOUND)
+    set(Kokkos_ENABLE_OPENMP ON CACHE BOOL "" FORCE)
+  else()
+    set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+  endif()
+  set(Kokkos_ARCH_NATIVE ON CACHE BOOL "" FORCE)
+elseif("${MU_ARCH}" STREQUAL "a100")
+  # A100 build: CUDA backend (with lambda support) plus the serial backend.
+  set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+  set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
+  set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "" FORCE)
+  set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
+else()
+  # Name only the values handled above.
+  message(FATAL_ERROR "MU_ARCH='${MU_ARCH}' is not a valid/tested configuration; select one of: x86_64, arm, a100")
+endif()
+
+FetchContent_MakeAvailable(kokkos)
+
+add_executable(demo main.cpp)
+target_link_libraries(demo PRIVATE Kokkos::kokkos)
diff --git a/main.cpp b/main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7d6f4231f05f46bf30ab0444f349f020f8a7aa5
--- /dev/null
+++ b/main.cpp
@@ -0,0 +1,279 @@
+#include <algorithm>
+#include <cerrno>
+#include <climits>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+#include "Kokkos_Timer.hpp"
+
+using space_t = Kokkos::DefaultExecutionSpace::memory_space;
+using md_range_policy = Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>>;
+
+// Shared stopwatch; every scenario resets it right before launching its kernels.
+Kokkos::Timer timer;
+
+//constexpr int nblocks = 2;
+//constexpr int nlev = 90;
+//constexpr int nproma = 55000;
+
+// Checks that array[i] == i for all nblocks * nlev * nproma elements. Reports
+// the first mismatch and aborts; unlike assert(), the check stays active when
+// NDEBUG is defined.
+static void validate(const double* array, int nblocks, int nlev, int nproma) {
+  const int total = nblocks * nlev * nproma;
+  for (int i = 0; i < total; ++i) {
+    if (array[i] != static_cast<double>(i)) {
+      std::fprintf(stderr, "validation failed: array[%d] = %f, expected %d\n", i, array[i], i);
+      std::abort();
+    }
+  }
+}
+
+// Waits for all outstanding Kokkos work and then, if requested, prints the time
+// elapsed since the last timer.reset(). Kokkos kernel launches may return
+// before the kernel has finished, so the fence must precede the timer read.
+static void report_elapsed(bool print) {
+  Kokkos::fence();
+  const double elapsed_ms = timer.seconds() * 1000;
+  if (print)
+    std::printf("Time = %f ms\n\n", elapsed_ms);
+}
+
+// Scenario 1: default-layout view of shape (nblocks, nlev, nproma), accessed as
+// d_view(jb, jk, jc). One kernel over jc is launched per block jb. The result
+// is copied back into `array` and validated.
+void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  if (print)
+    std::cout << "Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+  auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+      for (int jk = 0; jk < nlev; ++jk) {
+        const int p = jb * nlev * nproma + jk * nproma + jc;
+        d_view(jb, jk, jc) = p;
+      }
+    });
+
+  report_elapsed(print);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
+}
+
+// Scenario 2: LayoutRight view of shape (nproma, nlev, nblocks), accessed as
+// d_view(jc, jk, jb). One kernel over jc is launched per block jb. The result
+// is copied back into `array` and validated.
+void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  if (print)
+    std::cout << "Right layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+  auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+      for (int jk = 0; jk < nlev; ++jk) {
+        const int p = jc * nlev * nblocks + jk * nblocks + jb;
+        d_view(jc, jk, jb) = p;
+      }
+    });
+
+  report_elapsed(print);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
+}
+
+// Scenario 2b: LayoutRight view of shape (nblocks, nlev, nproma), accessed as
+// d_view(jb, jk, jc). One kernel over jc is launched per block jb. The result
+// is copied back into `array` and validated.
+void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  if (print)
+    std::cout << "Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nblocks, nlev, nproma);
+  auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+      for (int jk = 0; jk < nlev; ++jk) {
+        const int p = jb * nlev * nproma + jk * nproma + jc;
+        d_view(jb, jk, jc) = p;
+      }
+    });
+
+  report_elapsed(print);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
+}
+
+// Scenario 3: LayoutLeft view of shape (nproma, nlev, nblocks), accessed as
+// d_view(jc, jk, jb). One kernel over jc is launched per block jb. The result
+// is copied back into `array` and validated.
+void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  if (print)
+    std::cout << "Left layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+  auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+      for (int jk = 0; jk < nlev; ++jk) {
+        const int p = jb * nlev * nproma + jk * nproma + jc;
+        d_view(jc, jk, jb) = p;
+      }
+    });
+
+  report_elapsed(print);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
+}
+
+// Scenario 4: default-layout view of shape (nproma, nlev, nblocks), accessed as
+// d_view(jc, jk, jb). One kernel over jc is launched per block jb. The result
+// is copied back into `array` and validated.
+void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print=true) {
+  if (print)
+    std::cout << "Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
+
+  Kokkos::View<double***, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
+  auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
+
+  timer.reset();
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+      for (int jk = 0; jk < nlev; ++jk) {
+        const int p = jc * nlev * nblocks + jk * nblocks + jb;
+        d_view(jc, jk, jb) = p;
+      }
+    });
+
+  report_elapsed(print);
+  Kokkos::deep_copy(view, d_view);
+
+  validate(array, nblocks, nlev, nproma);
+}
+
+// Reads environment variable `name` as a strictly positive int. Prints a
+// message and exits when it is unset, not a number, or out of range.
+static int read_positive_int_env(const char* name) {
+  const char* text = std::getenv(name);
+  if (text == nullptr) {
+    std::fprintf(stderr, "environment variable %s is not set\n", name);
+    std::exit(EXIT_FAILURE);
+  }
+
+  char* end = nullptr;
+  errno = 0;
+  const long value = std::strtol(text, &end, 10);
+  if (errno != 0 || end == text || *end != '\0' || value <= 0 || value > INT_MAX) {
+    std::fprintf(stderr, "environment variable %s='%s' is not a positive int\n", name, text);
+    std::exit(EXIT_FAILURE);
+  }
+  return static_cast<int>(value);
+}
+
+int main() {
+  const int ncells = read_positive_int_env("NCELLS");
+  const int nlev = read_positive_int_env("NLEV");
+  const int nproma = read_positive_int_env("NPROMA");
+  const int nblocks = (ncells - 1) / nproma + 1;
+
+  std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
+
+  // The kernels and validate() index with int, so the element count must fit.
+  const long long total = static_cast<long long>(nblocks) * nlev * nproma;
+  if (total > INT_MAX) {
+    std::fprintf(stderr, "nblocks * nlev * nproma = %lld does not fit in int\n", total);
+    return EXIT_FAILURE;
+  }
+
+  // Heap storage: a stack array of this size can exceed the stack limit.
+  std::vector<double> storage(static_cast<std::size_t>(total));
+  double* array = storage.data();
+  /*
+  for (int jb = 0; jb < nblocks; ++jb)
+    for (int jk = 0; jk < nlev; ++jk)
+      for (int jc = 0; jc < nproma; ++jc) {
+        int p = jb * nlev * nproma + jk * nproma + jc;
+        array[p] = 1; //static_cast<double>(p);
+      }
+  */
+  /*
+  for (int i = 0; i < nblocks * nlev * nproma; ++i)
+    std::cout << array[i] << " " ;
+  std::cout << "\n";
+
+  for (int jb = 0; jb < nblocks; ++jb)
+    for (int jk = 0; jk < nlev; ++jk){
+      for (int jc = 0; jc < nproma; ++jc)
+        std::cout << view(jb, jk, jc)<< " ";
+      std::cout << "\n";
+    }
+  */
+
+  // Refills the array with a sentinel before each run so that validate()
+  // cannot pass on values left behind by the previous scenario.
+  auto run = [&](void (*scenario)(double*, int, int, int, bool), bool print) {
+    std::fill(storage.begin(), storage.end(), -1.0);
+    scenario(array, nblocks, nlev, nproma, print);
+  };
+
+  Kokkos::initialize();
+  {
+    // Warm-up: scenario 1 once without printing its label or time.
+    run(scenario_1, false);
+
+    run(scenario_1, true);
+    run(scenario_2, true);
+    run(scenario_2b, true);
+    run(scenario_3, true);
+    run(scenario_4, true);
+  }
+  Kokkos::finalize();
+
+  return 0;
+}
+
+/**
+ *
+ * #if 0
+  Kokkos::parallel_for(
+      "print", md_range_policy({0, 0, 0}, {nblocks, nlev, nproma}),
+      KOKKOS_LAMBDA(const int jb, const int jk, const int jc) {
+        int p = jb * nlev * nproma + jk * nproma + jc;
+        d_view(jb, jk, jc) += p;
+        printf("%f ", d_view(jb, jk, jc));
+      });
+  std::cout << "\n";
+#endif
+  for (int jb = 0 ; jb < nblocks; ++jb)
+    Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA (const int jc) {
+    for (int jk = 0; jk < nlev; ++jk) {
+      // int p = jb * nlev * nproma + jk * nproma + jc; left
+      int p = jc * nlev * nblocks + jk * nblocks + jb;
+      // d_view(jb, jk, jc) = p;
+      d_view(jc, jk, jb) = p;
+
+      // printf("%f ", d_view(jb, jk, jc));
+    }});
+*/
diff --git a/script.sh b/script.sh
new file mode 100755
index 0000000000000000000000000000000000000000..636792996cc0154654387fb7c42cb7b5046a30ae
--- /dev/null
+++ b/script.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Builds the demo for the selected target and runs it once for every
+# (ncells, nlev, nproma) combination.
+# Usage: ./script.sh [gpu]    (any other or no argument selects the CPU build)
+
+#gcc
+#nvhpc/24.7-gcc-11.2.0
+#export LD_LIBRARY_PATH
+
+# Stop at the first failing command (e.g. a broken build) instead of running a
+# stale or missing binary.
+set -eo pipefail
+
+# Best effort only: the demo keeps its working array on the heap.
+ulimit -s unlimited || true
+
+target="${1:-cpu}"
+
+if [[ "$target" == 'gpu' ]]
+then
+  build_dir=build_gpu
+  cmake -B "$build_dir" -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
+  cmake --build "$build_dir" --parallel
+  ncells=(5000000)
+  nlev=(90)
+  nproma=(10000 30000 50000 100000 1000000 5000000)
+else
+  build_dir=build
+  cmake -B "$build_dir" -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
+  cmake --build "$build_dir" --parallel
+  ncells=(5000000)
+  nlev=(90)
+  nproma=(32) # 64 96 128)
+
+  export OMP_PROC_BIND=close
+  export OMP_PLACES=cores
+  export OMP_NUM_THREADS=8
+fi
+
+for cells in "${ncells[@]}"; do
+  for levels in "${nlev[@]}"; do
+    for block_len in "${nproma[@]}"; do
+      export NPROMA=$block_len
+      export NLEV=$levels
+      export NCELLS=$cells
+      "./${build_dir}/demo"
+    done
+  done
+done