diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8dca0f4882ffed6d6e03f1a8affd070865fba39a..536ebb55487f6c1dc7c0a1e9c0a2406065582671 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.20)
-project(demo LANGUAGES CXX VERSION 0.0.1)
+project(demo LANGUAGES CXX Fortran VERSION 0.0.1)
 
 include(FetchContent)
 
@@ -30,15 +30,45 @@ elseif("${MU_ARCH}" STREQUAL "a100")
     set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
     set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
     set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
+elseif("${MU_ARCH}" STREQUAL "mi250x")
+    set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
+    set(Kokkos_ENABLE_HIP ON CACHE BOOL "" FORCE)
+    set(Kokkos_ARCH_AMD_GFX90A ON CACHE BOOL "" FORCE)
 else()
     message(FATAL_ERROR "${MU_ARCH} is not a valid/tested configuration, select architecture, x86_64, a100, h100, mi250x, mi300a(-unified), h100(-unified)")
 endif()
 
 FetchContent_MakeAvailable(kokkos)
 
-add_executable(demo main.cpp)
+add_executable(demo cdriver.cpp main.cpp)
 target_link_libraries(demo PUBLIC Kokkos::kokkos)
 
+set_source_files_properties(
+  fdriver.f90
+  PROPERTIES Fortran_PREPROCESS ON
+)
+
+# Enable OpenACC in the Fortran driver only when Kokkos itself targets a GPU backend.
+if (Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
+  set(WITH_OPENACC_FORTRAN 1)
+else()
+  set(WITH_OPENACC_FORTRAN 0)
+endif()
+
+add_executable(fdemo fdriver.f90 main.cpp)
+target_link_libraries(fdemo PUBLIC Kokkos::kokkos)
+if (WITH_OPENACC_FORTRAN)
+  #find_package(OpenACC REQUIRED OpenACC_Fortran)
+  find_package(OpenACC)
+  # Test the variable by name: an unset/empty value would otherwise expand to 'if (NOT )'.
+  if (NOT OpenACC_Fortran_FOUND)
+    message(FATAL_ERROR "OpenACC_Fortran_FOUND=${OpenACC_Fortran_FOUND}")
+  endif()
+  target_link_libraries(fdemo PUBLIC OpenACC::OpenACC_Fortran )
+endif()
+# fdriver.f90 contains the PROGRAM unit, so fdemo is linked with the Fortran compiler.
+set_property(TARGET fdemo PROPERTY LINKER_LANGUAGE Fortran)
+
 if ("${MU_ARCH}" STREQUAL "a100")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -Minfo=accel -gpu=cc80")
     add_compile_definitions(DEMO_DEVICE)
@@ -46,3 +76,7 @@ if ("${MU_ARCH}" STREQUAL "a100")
     target_link_options(demo PUBLIC "-gpu=pinned")
 endif()
 
+if ("${MU_ARCH}" STREQUAL "mi250x")
+  add_compile_definitions(DEMO_DEVICE)
+endif()
+
diff --git a/cdriver.cpp b/cdriver.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..beac2f23b91cffa788c7c9143c3b82bc0fab717f
--- /dev/null
+++ b/cdriver.cpp
@@ -0,0 +1,14 @@
+// C++ entry point of the 'demo' executable: runs cpp_run_tests() between
+// Kokkos::initialize() and Kokkos::finalize().
+#include <Kokkos_Core.hpp>
+
+// Defined in main.cpp.
+void cpp_run_tests();
+
+int main() {
+  Kokkos::initialize();
+  cpp_run_tests();
+  Kokkos::finalize();
+  return 0;
+}
+
diff --git a/conf.sh b/conf.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b81fc040face5968bbbbd39a62b09da7488bdfe7
--- /dev/null
+++ b/conf.sh
@@ -0,0 +1,21 @@
+# source this
+#
+# Exports DEVICE according to the host name; script.sh and debug_script.sh
+# pass it to CMake as -DMU_ARCH for the GPU build.
+
+case $(hostname) in
+  levante*.dkrz.de|vader*.dkrz.de)
+    export DEVICE=a100
+    ;;
+  nid*) # lumi-g
+    export DEVICE=mi250x
+    ;;
+  *)
+    echo "unknown system: $(hostname)" >&2
+    # 'return' leaves a sourced file without terminating the caller's shell;
+    # fall back to 'exit' when the file is executed instead of sourced.
+    return 1 2>/dev/null || exit 1
+    ;;
+esac
+
+echo "$0: DEVICE=$DEVICE"
diff --git a/debug_script.sh b/debug_script.sh
index b2200668469b472497b9bd8c4f7be505c3e802a2..73a621d605de024afff29c5fe74a7264dd461806 100755
--- a/debug_script.sh
+++ b/debug_script.sh
@@ -8,10 +8,15 @@ ulimit -s unlimited
 
 set -e
 
+# set DEVICE:
+source conf.sh
+
+export VERBOSE_LEVEL=2
+
 if [ "$1" == 'gpu' ]
 then
     #rm -rf build_gpu
-    cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O0 -g"
+    cmake -B build_gpu -S . -DMU_ARCH="$DEVICE" -DCMAKE_CXX_FLAGS="-O0 -g"
     cmake --build build_gpu --parallel
 
     ncells=(8)
@@ -28,7 +33,7 @@ else
 
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores
-    export OMP_NUM_THREADS=8
+    export OMP_NUM_THREADS=1
 fi
 
 for jb in ${ncells[*]}; do
@@ -39,9 +44,11 @@ for jb in ${ncells[*]}; do
         export NCELLS=$jb
         if [ "$1" == 'gpu' ]
         then
-            ./build_gpu/demo
+            #./build_gpu/demo
+            ./build_gpu/fdemo
         else
-            ./build/demo
+            #./build/demo
+            ./build/fdemo
         fi
         echo "---"
     done
diff --git a/fdriver.f90 b/fdriver.f90
new file mode 100644
index 0000000000000000000000000000000000000000..d4b93c8dcb95d1582ed5f66be476e73af0071b74
--- /dev/null
+++ b/fdriver.f90
@@ -0,0 +1,191 @@
+! Fortran driver (PROGRAM fdriver) and its support module for the Kokkos demo.
+! The module declares the C bindings implemented in main.cpp and provides the
+! OpenACC kernels that initialise the fields and compute the reference result.
+MODULE fsupport
+  USE, INTRINSIC:: iso_c_binding
+#ifdef _OPENACC
+  USE openacc
+#endif
+  IMPLICIT NONE
+  PRIVATE
+  PUBLIC :: dp
+  PUBLIC :: t_domain
+  PUBLIC :: get_domain
+  PUBLIC :: init_field
+  PUBLIC :: zero_field
+  PUBLIC :: physop
+  PUBLIC :: f2c_default_run_all
+  !PUBLIC :: show_field
+  PUBLIC :: f2c_kokkos_initialize
+  PUBLIC :: f2c_kokkos_finalize
+  INTEGER, PARAMETER :: dp = c_double
+
+  ! Interoperable with 'struct Domain' in main.cpp; the component order must match.
+  TYPE, BIND(c):: t_domain
+    INTEGER(c_int) :: ncells
+    INTEGER(c_int) :: nlev
+    INTEGER(c_int) :: nblocks
+    INTEGER(c_int) :: nproma
+  END TYPE t_domain
+
+  INTERFACE
+    SUBROUTINE f2c_kokkos_initialize() BIND(c)
+    END SUBROUTINE f2c_kokkos_initialize
+
+    SUBROUTINE f2c_kokkos_finalize() BIND(c)
+    END SUBROUTINE f2c_kokkos_finalize
+
+    SUBROUTINE f2c_get_domain(dom) BIND(c)
+      IMPORT t_domain
+      TYPE(t_domain) :: dom
+    END SUBROUTINE f2c_get_domain
+
+    SUBROUTINE f2c_default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma) BIND(c)
+      IMPORT c_double, c_int
+      REAL(c_double) :: field(*), ref_in(*), ref_out(*)
+      ! c_int matches the by-value 'int' parameters of the C++ definition.
+      INTEGER(c_int), VALUE :: nblocks, nlev, nproma
+    END SUBROUTINE f2c_default_run_all
+  END INTERFACE
+
+CONTAINS
+
+  ! Fills dom through the C++ helper f2c_get_domain.
+  SUBROUTINE get_domain(dom)
+    TYPE(t_domain), INTENT(out) :: dom
+    CALL f2c_get_domain(dom)
+  END SUBROUTINE get_domain
+
+  ! Sets field(i,k,j) to its zero-based linear index (i fastest, then k, then j).
+  SUBROUTINE init_field(field)
+    REAL(dp), INTENT(out) :: field(:,:,:)
+    INTEGER :: j,k,i
+    INTEGER :: nj,nk,ni
+    nj = SIZE(field,3)
+    nk = SIZE(field,2)
+    ni = SIZE(field,1)
+    !PRINT*,'ni,nk,nj=',ni,nk,nj
+    DO j = 1, nj
+!$ACC parallel default(present)
+!$ACC LOOP GANG VECTOR COLLAPSE(2)
+      DO k = 1, nk
+        DO i = 1, ni
+          field(i,k,j) = REAL(i-1 + (k-1)*ni + (j-1)*ni*nk, dp)
+          !PRINT*,'i,k,j,val=',i,k,j, field(i,k,j)
+        ENDDO
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE init_field
+
+  ! Sets every element of v to zero.
+  SUBROUTINE zero_field(v)
+    REAL(dp), INTENT(out) :: v(:,:,:)
+    INTEGER :: j,k,i
+    INTEGER :: nj,nk,ni
+    nj = SIZE(v,3)
+    nk = SIZE(v,2)
+    ni = SIZE(v,1)
+    DO j = 1, nj
+!$ACC parallel default(present)
+!$ACC LOOP GANG VECTOR COLLAPSE(2)
+      DO k = 1, nk
+        DO i = 1, ni
+          v(i,k,j) = 0.0_dp
+        ENDDO
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE zero_field
+
+  ! In-place update along the second dimension, swept in ascending k:
+  ! v(i,k,j) += v(i,k-1,j)/100 + v(i,k+1,j)/10000, one-sided at k=1 and k=nk.
+  SUBROUTINE physop(v)
+    ! v is read before it is written, so it must be INTENT(inout).
+    REAL(dp), INTENT(inout) :: v(:,:,:)
+    INTEGER :: j,k,i
+    INTEGER :: nj,nk,ni
+    nj = SIZE(v,3)
+    nk = SIZE(v,2)
+    ni = SIZE(v,1)
+    PRINT*,'physop: ni,nk,nj=',ni,nk,nj
+    DO j = 1, nj
+!$ACC PARALLEL DEFAULT(PRESENT)
+      k = 1
+!$ACC LOOP GANG VECTOR
+      DO i = 1, ni
+        v(i,k,j) = v(i,k,j) + v(i,k+1,j)/10000
+      ENDDO
+!$ACC LOOP SEQ
+      DO k = 2, nk-1
+!$ACC LOOP GANG VECTOR
+        DO i = 1, ni
+          v(i,k,j) = v(i,k,j) + v(i,k-1,j)/100 + v(i,k+1,j)/10000;
+        ENDDO
+      ENDDO
+      k = nk
+!$ACC LOOP GANG VECTOR
+      DO i = 1, ni
+        v(i,k,j) = v(i,k,j) + v(i,k-1,j)/100;
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE physop
+
+#if 0
+  ! Disabled debug helper: prints every element of f. Extents follow the f(i,k,j) indexing.
+  SUBROUTINE show_field(label,f)
+    CHARACTER(len=*), INTENT(in) :: label
+    REAL(dp), INTENT(in) :: f(:,:,:)
+    INTEGER :: i,k,j
+    INTEGER :: ni,nk,nj
+    nj = SIZE(f,3)
+    nk = SIZE(f,2)
+    ni = SIZE(f,1)
+    DO j = 1, nj
+!$ACC PARALLEL DEFAULT(PRESENT)
+!$ACC LOOP SEQ
+      DO k = 1, nk
+        DO i = 1, ni
+          PRINT '(A,3I8,F)','show_field: '//label,j,k,i,f(i,k,j)
+        ENDDO
+      ENDDO
+!$ACC END PARALLEL
+    ENDDO
+  END SUBROUTINE show_field
+#endif
+END MODULE fsupport
+
+PROGRAM fdriver
+  USE fsupport, ONLY: dp, t_domain, get_domain, init_field, zero_field, physop, &
+    & f2c_default_run_all, f2c_kokkos_initialize, f2c_kokkos_finalize
+  IMPLICIT NONE
+
+  TYPE(t_domain) :: dom
+  REAL(dp), ALLOCATABLE :: field(:,:,:), ref_in(:,:,:), ref_out(:,:,:)
+  CALL f2c_kokkos_initialize()
+  PRINT*,'fdriver::start'
+  CALL get_domain(dom)
+  PRINT '(A,4I8)','ncells,lev,nblocks,nproma=',dom%ncells,dom%nlev,dom%nblocks,dom%nproma
+
+  ALLOCATE( field(dom%nproma,dom%nlev,dom%nblocks), &
+    & ref_in(dom%nproma,dom%nlev,dom%nblocks), &
+    & ref_out(dom%nproma,dom%nlev,dom%nblocks) )
+
+  ! With OpenACC: create device copies; the kernels below expect them (default(present)).
+!$acc enter data create(field, ref_in, ref_out)
+  CALL zero_field(field)
+  CALL init_field(ref_in)
+  CALL init_field(ref_out)
+  CALL physop(ref_out)
+
+  ! With OpenACC: hand the device addresses of the arrays to the C++ side.
+!$acc host_data use_device(field, ref_in, ref_out)
+  CALL f2c_default_run_all(field, ref_in, ref_out, dom%nblocks, dom%nlev, dom%nproma)
+!$acc end host_data
+
+!$acc exit data copyout(field, ref_in, ref_out)
+
+  CALL f2c_kokkos_finalize()
+  PRINT*,'fdriver:end'
+END PROGRAM fdriver
diff --git a/lumi-cpu-job.sh b/lumi-cpu-job.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dbc60acb1ff6cd2387d833ef5cef689d52582ecd
--- /dev/null
+++ b/lumi-cpu-job.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+#SBATCH --job-name=demo_cpu
+#SBATCH --output=demo_cpu.o%j
+#SBATCH --error=demo_cpu.o%j
+#SBATCH --partition=dev-g
+#SBATCH --ntasks=1
+#SBATCH --cpus-per-task=8
+#SBATCH --time=00:10:00
+#SBATCH --account=project_465001141
+
+export VERBOSE=1
+#./debug_script.sh
+./script.sh
diff --git a/lumi-gpu-job.sh b/lumi-gpu-job.sh
new file mode 100755
index 0000000000000000000000000000000000000000..9e98eb954132a085bc787ffc971382765b7edfd4
--- /dev/null
+++ b/lumi-gpu-job.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+#SBATCH --job-name=demo_gpu # Job name
+#SBATCH --output=demo_gpu.o%j # Name of stdout output file
+#SBATCH --error=demo_gpu.o%j # Name of stderr error file
+#SBATCH --partition=dev-g # partition name : standard-g
+#SBATCH --gpus=1
+#SBATCH --time=00:10:00 # Run time (d-hh:mm:ss)
+#SBATCH --account=project_465001141 # Project for billing
+
+
+pwd
+
+#./debug_script.sh gpu
+./script.sh gpu
diff --git a/lumi_cpu_setup.sh b/lumi_cpu_setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f70aaf4bbe0adfab3ed6298eb30bb7d4cd8e249d
--- /dev/null
+++ b/lumi_cpu_setup.sh
@@ -0,0 +1,22 @@
+# source this
+module load LUMI/24.03
+module load buildtools/24.03
+module load googletest/1.14.0-cpeCray-24.03
+
+
+
+module load PrgEnv-cray
+export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+
+# load GPU support
+#module load craype-accel-amd-gfx90a
+module load rocm
+module load cce/17.0.1 craype-x86-milan
+
+# new: python with jinja2
+#source ~/venv/bin/activate
+
+#export CRAYPE_LINK_TYPE=dynamic
+export CXX=CC
+export FC=ftn
diff --git a/lumi_gpu_setup.sh b/lumi_gpu_setup.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2e6c6e6fc617af3d80a23a221e08b2ebc119ce69
--- /dev/null
+++ b/lumi_gpu_setup.sh
@@ -0,0 +1,20 @@
+# source this
+module load LUMI/24.03
+module load buildtools/24.03
+module load googletest/1.14.0-cpeCray-24.03
+
+
+module load PrgEnv-cray
+export LD_LIBRARY_PATH=${CRAY_LD_LIBRARY_PATH}:$LD_LIBRARY_PATH
+echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
+
+# load GPU support
+module load craype-accel-amd-gfx90a
+module load rocm
+
+# new: python with jinja2
+source ~/venv/bin/activate
+
+export CRAYPE_LINK_TYPE=dynamic
+export CXX=CC
+export FC=ftn
diff --git a/main.cpp b/main.cpp
index eaa71b98ad2b38e86cf39254017a79b5cb5507cf..0326de323dd76658cacf8727bf70cee3bfba1b4d 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,5 +1,6 @@
 #include <Kokkos_Core.hpp>
 #include <functional>
+#undef NDEBUG // always use assertion
 #include <cassert>
 #include <iostream>
 #include <utility>
@@ -13,15 +14,10 @@ using space_t = Kokkos::DefaultExecutionSpace::memory_space;
 typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
 
 Kokkos::Timer timer;
-//Kokkos::Timer transposition_timer;
-//Kokkos::Timer merge_horizontal_timer;
-
-// constexpr int nblocks = 2;
-// constexpr int nlev = 90;
-// constexpr int nproma = 55000;
 
 #define ENABLE_CHECK_BOUNDS
 
+static int verbose_level = 0;
 
 struct AllLevels {
   constexpr static int value = 0;
@@ -50,6 +46,27 @@ static void validate(double* array, const double *ref, int nblocks, int nlev, in
   }
 }
 
+
+inline static HOST_DEVICE_ATTRIBUTES double rel_error(double x1, double x0) { // relative error, absolute when |x0| <= eps
+  constexpr double eps = 1.e-14;
+  if (fabs(x0) > eps) {
+    return fabs(x1-x0)/fabs(x0);
+  } else {
+    return fabs(x1-x0);
+  }
+}
+
+static void validate_max_error(double* array, const double *ref, int nblocks, int nlev, int nproma) {
+  const int ntot = nblocks*nlev*nproma;
+  double totmax;
+  Kokkos::parallel_reduce("validator", ntot, KOKKOS_LAMBDA(const int &i, double &emax) {
+    double err = rel_error(array[i],ref[i]);
+    emax = fmax(emax, err);
+  }, Kokkos::Max<double>(totmax));
+  if (verbose_level > 0) printf("validate_max_error: total max error=%.10e\n",totmax);
+  assert(totmax < 1.e-14);
+}
+
 KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int i2, int n0, int n1, int n2) {
 #ifdef ENABLE_CHECK_BOUNDS
   assert(i0 >= 0 && i1 >= 0 && i2 >= 0 && i0 < n0 && i1 < n1 && i2 < n2);
@@ -62,37 +79,6 @@ KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int n0, int n1) {
 #endif
 }
 
-template<typename ViewType>
-void show_view(const std::string &label, const ViewType &view, const bool with_values=false) {
-  printf("show_view: label=%s, ",label.c_str());
-  using MyLayout = typename ViewType::array_layout;
-  if (std::is_same<MyLayout, Kokkos::LayoutLeft>::value) {
-    printf("Layout=LayoutLeft\n");
-  } else if (std::is_same<MyLayout, Kokkos::LayoutRight>::value) {
-    printf("Layout=LayoutRight\n");
-  } else {
-    printf("Layout: **unknown**\n");
-  }
-  if (with_values) {
-#ifdef DEMO_DEVICE
-    auto h_view = Kokkos::create_mirror_view_and_copy(Kokkos::Serial(),view);
-#else
-    auto &h_view = view;
-#endif
-    const int n0 = h_view.extent(0);
-    const int n1 = h_view.extent(1);
-    const int n2 = h_view.extent(2);
-    printf("n0=%d, n1=%d, n2=%d\n",n0,n1,n2);
-    for (int i0=0; i0 < n0; ++i0) {
-      for (int i2=0; i2 < n2; ++i2) {
-        for (int i1=0; i1 < n1; ++i1) {
-          printf("show_view: i0=%d, i1=%d, i2=%d, v=%f\n",i0,i1,i2, h_view(i0,i1,i2));
-        }
-      };
-    };
-  }
-  Kokkos::fence();
-}
 
 template<typename ViewType>
 inline HOST_DEVICE_ATTRIBUTES void update_column(ViewType &v) {
@@ -125,7 +111,6 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
   auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
 
   timer.reset();
-
   for (int jb = 0; jb < nblocks; ++jb)
     Kokkos::parallel_for(
         "", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
@@ -133,18 +118,20 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
         });
   Kokkos::fence();
 
-  Kokkos::fence();
   if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
 
   Kokkos::deep_copy(view, d_view);
-  //validate(array, nblocks, nlev, nproma);
+}
+
+void scenario_1_noprint(double* array, int nblocks, int nlev, int nproma) {
+  printf("scenario_1_noprint\n");
+  scenario_1(array, nblocks, nlev, nproma, false);
 }
 
 
 void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
   std::cout << "scenario 1b: always LayoutRight; view(array, nblocks, nlev, nproma); parallel: nproma ----- "
-            << std::endl;
-
+    << std::endl;
   Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
 
   timer.reset();
@@ -170,6 +157,8 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
   //Kokkos::View<double**, Kokkos::MemoryUnmanaged> d_view(array, ncells, nlev);
   Kokkos::View<double**> d_view2d("d_view2d", ncells, nlev);
   Kokkos::Timer tr_timer;
+  Kokkos::Timer total_timer;
+  total_timer.reset();
 
   // view3d -> view2d:
   tr_timer.reset();
@@ -182,7 +171,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
       });
     });
   Kokkos::fence();
-  printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
+  printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
 
   timer.reset();
   Kokkos::parallel_for(
@@ -192,7 +181,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
       update_column(column);
     });
   Kokkos::fence();
-  printf("Time = %f ms\n", timer.seconds() * 1000);
+  printf("Kernel = %f ms\n", timer.seconds() * 1000);
 
   // view2d -> view3d:
   tr_timer.reset();
@@ -205,7 +194,9 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
       });
     });
   Kokkos::fence();
-  printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
+  printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
+
+  printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
 }
 
 
@@ -223,6 +214,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
 
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view3d(array, nproma, nlev, nblocks);
   Kokkos::Timer tr_timer;
+  Kokkos::Timer total_timer;
+  total_timer.reset();
 
   // view3d -> view2d:
   tr_timer.reset();
@@ -235,7 +228,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
       });
     });
   Kokkos::fence();
-  printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
+  printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
 
   timer.reset();
 
@@ -247,7 +240,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
     });
 
   Kokkos::fence();
-  printf("Time = %f ms\n", timer.seconds() * 1000);
+  printf("Kernel = %f ms\n", timer.seconds() * 1000);
 
   // view2d -> view3d:
   tr_timer.reset();
@@ -260,7 +253,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
       });
     });
   Kokkos::fence();
-  printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
+  printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
+  printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
 }
 
 
@@ -580,8 +574,7 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru
 #endif
 
 void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
-
-  std::cout << "scenario 7b: +ACC modifiedDefault layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
+  std::cout << "scenario 7b: +ACC Default layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
 #ifdef DEMO_DEVICE
   Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
 #else
@@ -606,14 +599,13 @@ void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
 
 
 void scenario_7bmacro(double* array, int nblocks, int nlev, int nproma) {
-  std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) parallel: asICON----- " << std::endl;
+  std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) para: asICON" << std::endl;
 #if defined(DEMO_DEVICE)
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
 #else
   Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
 #endif
 
-  show_view("7bmacro",d_view);
   timer.reset();
   outer_for("", 0, nblocks, outer_lambda(const int jb){
     inner_for("",0, nproma, inner_lambda(const int jc) {
@@ -649,7 +641,6 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
   std::cout << "scenario 8: +ACC always LayoutLeft view(array, nproma, nlev, nblocks) parallel: nproma ----- " << std::endl;
 
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks) ;
-  show_view("scenario_8", d_view);
   timer.reset();
   for (int jb = 0; jb < nblocks; ++jb)
     Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
@@ -662,7 +653,7 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
 
 
 void scenario_8macro(double* array, int nblocks, int nlev, int nproma) {
-  std::cout << "scenario 8macro: +ACC always LayoutLeftm, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
+  std::cout << "scenario 8macro: +ACC always LayoutLeft, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
   using space_t = Kokkos::DefaultExecutionSpace::memory_space;
   Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
   timer.reset();
@@ -733,8 +724,47 @@ void physop(double *array, int nproma, int nlev, int nblocks) {
   }
 }
 
+bool is_device_ptr(const void* ptr) {
+  using ExecSpace = Kokkos::DefaultExecutionSpace;
+#ifdef KOKKOS_ENABLE_CUDA
+  constexpr bool exec_on_cuda = std::is_same<ExecSpace,Kokkos::Cuda>::value;
+  if constexpr (exec_on_cuda) {
+    cudaPointerAttributes att;
+    auto err = cudaPointerGetAttributes (&att, ptr);
+    if (err != cudaSuccess) { (void)cudaGetLastError(); return false; } // pointer unknown to the runtime: not device memory
+    if (verbose_level > 2 ) {
+      printf("is_device_ptr: device=%d\n",att.device);
+      printf("is_device_ptr: memtype=%d\n",att.type );
+      printf("is_device_ptr: ptr=%p\n",ptr);
+      printf("is_device_ptr: d_ptr=%p\n",att.devicePointer);
+      printf("is_device_ptr: h_ptr=%p\n",att.hostPointer);
+    }
+    if (att.devicePointer && ptr == att.devicePointer ) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+#endif
+#ifdef KOKKOS_ENABLE_HIP
+  constexpr bool exec_on_hip = std::is_same<ExecSpace,Kokkos::HIP>::value;
+  if constexpr (exec_on_hip) {
+    hipPointerAttribute_t att;
+    auto err = hipPointerGetAttributes(&att, ptr);
+    if (err != hipSuccess) { (void)hipGetLastError(); return false; } // pointer unknown to the runtime: not device memory
+    if (att.devicePointer && ptr == att.devicePointer ) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+#endif
+
+  return false;
 
-void run_scenario(double *array, const double*array_in_ref, const double *array_out_ref,
+}
+
+void run_scenario_using_host_data(double *array, const double *array_in_ref, const double *array_out_ref,
                   int nblocks, int nlev, int nproma,
                   const std::function<void(double*, int, int, int)> &scenario) {
   const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
@@ -742,17 +772,113 @@ void run_scenario(double *array, const double*array_in_ref, const double *array_
   openacc_calls(array, array_out_ref, nblocks, nlev, nproma, scenario);
 }
 
-int main() {
-  int64_t ncells64 = atoi(std::getenv("NCELLS"));
-  assert(ncells64 < INT_MAX);
-  int64_t nproma64 = atoi(std::getenv("NPROMA"));
-  assert(nproma64 < INT_MAX);
-  int ncells = ncells64;
-  int nlev = atoi(std::getenv("NLEV"));
-  int nproma = nproma64;
+void default_run_scenario(double *array, const double *array_in_ref, const double *array_out_ref,
+                          int nblocks, int nlev, int nproma,
+                          const std::function<void(double*, int, int, int)> &scenario) {
+  // init:
+  Kokkos::View<double*, Kokkos::MemoryUnmanaged> view(array, nblocks*nlev*nproma);
+  Kokkos::View<const double*, Kokkos::MemoryUnmanaged> view_in_ref(array_in_ref, nblocks*nlev*nproma);
+  Kokkos::deep_copy(view, view_in_ref);
+
+  // run:
+  scenario(view.data(), nblocks, nlev, nproma);
+
+  // check:
+  validate_max_error(array, array_out_ref, nblocks, nlev, nproma);
+}
+
+struct Domain {
+  int ncells;
+  int nlev;
+  int nblocks;
+  int nproma;
+};
+
+int get_int(const char* str, const int default_value) {
+  const char* istr = std::getenv(str);
+  if (not istr) {
+    printf("get_int: key=%s, using default=%d\n",str, default_value);
+    return default_value;
+  }
+  // strtoll keeps the full 64-bit value, so the range assert below is meaningful (atoi is undefined on overflow)
+  int64_t i64 = std::strtoll(istr, nullptr, 10);
+  assert(i64 >= INT_MIN and i64 <= INT_MAX);
+  int i = i64;
+  return i;
+}
+
+Domain get_domain() {
+  int ncells = get_int("NCELLS", 0);
+  int nproma = get_int("NPROMA", 0);
+  int nlev = get_int("NLEV", 0);
+  assert(nproma > 0); // NPROMA unset (default 0) or non-positive would divide by zero below
   int nblocks = (ncells - 1) / nproma + 1;
   assert(nproma*nblocks == ncells);
 
+  Domain dom;
+  dom.ncells = ncells;
+  dom.nlev = nlev;
+  dom.nproma = nproma;
+  dom.nblocks = nblocks;
+  return dom;
+}
+
+extern "C"
+void f2c_get_domain(Domain *dom) {
+  *dom = get_domain(); // single call: the environment is read (and defaults reported) only once
+}
+
+
+void default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
+  verbose_level=get_int("VERBOSE_LEVEL", 0);
+  if (verbose_level > 0) printf("default_run_all: start\n");
+
+  assert(is_device_ptr(field) == is_device_ptr(ref_in));
+  assert(is_device_ptr(field) == is_device_ptr(ref_out));
+
+  std::vector funs = {
+    scenario_1_noprint,
+    scenario_1b, scenario_1c, scenario_1cmacro,
+    scenario_6b,
+    scenario_7bmacro, scenario_7btemplate,
+    scenario_8, scenario_8macro, scenario_8template
+  };
+
+  if (exec_on_device() and (nproma > 50000)) {
+    printf("skip scenario_7b: GPU parallel over blocks way too slow\n\n");
+  } else {
+    funs.push_back(scenario_7b);
+  }
+
+  for (const auto &f : funs) {
+    default_run_scenario(field, ref_in, ref_out,
+                         nblocks, nlev, nproma,
+                         f);
+    printf("====\n");
+  }
+  if (verbose_level > 0) printf("default_run_all: end\n");
+}
+
+extern "C"
+void f2c_default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
+  default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma);
+}
+
+extern "C" void f2c_kokkos_initialize() {
+  Kokkos::initialize();
+}
+
+extern "C" void f2c_kokkos_finalize() {
+  Kokkos::finalize();
+}
+
+void cpp_run_tests() {
+  const Domain dom = get_domain();
+  int nlev = dom.nlev;
+  int nproma = dom.nproma;
+  int nblocks = dom.nblocks;
+
+  verbose_level=get_int("VERBOSE_LEVEL", 0);
   std::cout << "nblocks=" << nblocks << ", nlev=" << nlev
             << ", nproma=" << nproma << std::endl;
 
@@ -760,7 +886,6 @@ int main() {
   double array_in_ref[nblocks * nlev * nproma];
   double array[nblocks * nlev * nproma];
   const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
-  Kokkos::initialize();
   {
     init_array(array_in_ref, nproma,nlev,nblocks);
 
@@ -774,9 +899,9 @@ int main() {
     //memcpy(array, array_in_ref, nbytes);
     //scenario_1(array, nblocks, nlev, nproma);
     //validate(array, array_out_ref, nblocks, nlev, nproma);
-    run_scenario(array, array_in_ref, array_out_ref,
-                 nblocks, nlev, nproma,
-                 scenario_6b);
+    run_scenario_using_host_data(array, array_in_ref, array_out_ref,
+                                 nblocks, nlev, nproma,
+                                 scenario_6b);
     memcpy(array, array_in_ref, nbytes);
     std::function<void(double*, int, int, int)> s_1b = scenario_1b;
     openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1b);
@@ -834,7 +959,6 @@ int main() {
     */
 
   }
-  Kokkos::finalize();
-  return 0;
+
 }
 
diff --git a/script.sh b/script.sh
index ae78a01db5b7d5443b81900aa409e70930af2964..cfa780001f6aeb7d247289350b9762e69fbbbef6 100755
--- a/script.sh
+++ b/script.sh
@@ -4,14 +4,20 @@
 #nvhpc/24.7-gcc-11.2.0
 #export LD_LIBRARY_PATH
 
+
 ulimit -s unlimited
 
 set -e
 
+# set DEVICE:
+source conf.sh
+
+export VERBOSE_LEVEL=2
+
 if [ "$1" == 'gpu' ]
 then
     #rm -rf build_gpu
-    cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
+    cmake -B build_gpu -S . -DMU_ARCH="$DEVICE" -DCMAKE_CXX_FLAGS="-O3"
     cmake --build build_gpu --parallel
 
     ncells=(5000064)
@@ -20,7 +26,7 @@ then
     #nproma=$((449*29))
 else
     #rm -rf build
-    cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
+    cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3" -DCMAKE_Fortran_FLAGS="-O3 -fopenmp"
     cmake --build build --parallel
 
     ncells=(5000064)
@@ -30,6 +36,7 @@ else
     export OMP_PROC_BIND=close
     export OMP_PLACES=cores
     export OMP_NUM_THREADS=8
+    echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
 fi
 
 for jb in ${ncells[*]}; do
@@ -40,10 +47,12 @@ for jb in ${ncells[*]}; do
         export NCELLS=$jb
         if [ "$1" == 'gpu' ]
         then
-            ./build_gpu/demo
+            #./build_gpu/demo
+            ./build_gpu/fdemo
         else
-            ./build/demo
-            echo "---"
+            #./build/demo
+            ./build/fdemo
+            echo "--------------------------------------------------------"
         fi
     done
 done