Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • k202174/demo
1 result
Show changes
Commits on Source (2)
cmake_minimum_required(VERSION 3.20)
project(demo LANGUAGES CXX VERSION 0.0.1)
project(demo LANGUAGES CXX Fortran VERSION 0.0.1)
include(FetchContent)
......@@ -30,15 +30,42 @@ elseif("${MU_ARCH}" STREQUAL "a100")
set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
set(Kokkos_ENABLE_CUDA ON CACHE BOOL "" FORCE)
set(Kokkos_ARCH_AMPERE80 ON CACHE BOOL "" FORCE)
elseif("${MU_ARCH}" STREQUAL "mi250x")
set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "" FORCE)
set(Kokkos_ENABLE_HIP ON CACHE BOOL "" FORCE)
set(Kokkos_ARCH_AMD_GFX90A ON CACHE BOOL "" FORCE)
else()
message(FATAL_ERROR "${MU_ARCH} is not a valid/tested configuration, select architecture, x86_64, a100, h100, mi250x, mi300a(-unified), h100(-unified)")
endif()
FetchContent_MakeAvailable(kokkos)
add_executable(demo main.cpp)
add_executable(demo cdriver.cpp main.cpp)
target_link_libraries(demo PUBLIC Kokkos::kokkos)
# Run the preprocessor on the Fortran driver so that C-preprocessor directives
# in that source file are honoured.
set_source_files_properties(
  fdriver.f90
  PROPERTIES Fortran_PREPROCESS ON
)

# The Fortran driver is built with OpenACC only when Kokkos targets a GPU
# backend (CUDA or HIP).
if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
  set(WITH_OPENACC_FORTRAN 1)
else()
  set(WITH_OPENACC_FORTRAN 0)
endif()

# Mixed-language executable: Fortran main program plus the C++/Kokkos kernels.
add_executable(fdemo fdriver.f90 main.cpp)
target_link_libraries(fdemo PUBLIC Kokkos::kokkos)

if(WITH_OPENACC_FORTRAN)
  # Not REQUIRED on purpose: only the Fortran result is checked below.
  #find_package(OpenACC REQUIRED OpenACC_Fortran)
  find_package(OpenACC)
  # Test the variable by name. Expanding it (`${OpenACC_Fortran_FOUND}`) turns
  # an empty/unset value into the malformed condition `if(NOT )`.
  if(NOT OpenACC_Fortran_FOUND)
    message(FATAL_ERROR
      "OpenACC support for Fortran is required for GPU builds but was not found "
      "(OpenACC_Fortran_FOUND=${OpenACC_Fortran_FOUND})")
  endif()
  target_link_libraries(fdemo PUBLIC OpenACC::OpenACC_Fortran)
endif()

# Link with the Fortran compiler driver.
set_property(TARGET fdemo PROPERTY LINKER_LANGUAGE Fortran)
if ("${MU_ARCH}" STREQUAL "a100")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -Minfo=accel -gpu=cc80")
add_compile_definitions(DEMO_DEVICE)
......@@ -46,3 +73,7 @@ if ("${MU_ARCH}" STREQUAL "a100")
target_link_options(demo PUBLIC "-gpu=pinned")
endif()
if ("${MU_ARCH}" STREQUAL "mi250x")
add_compile_definitions(DEMO_DEVICE)
endif()
#include <Kokkos_Core.hpp>

// Test entry point, defined in another translation unit.
void cpp_run_tests();

// C++ driver: brackets the test run with Kokkos start-up and shutdown.
// argc/argv are forwarded so that Kokkos command-line options (--kokkos-*)
// given to the executable take effect instead of being ignored.
int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  cpp_run_tests();
  Kokkos::finalize();
  return 0;
}
# source this
#
# Sets and exports DEVICE according to the host name of the current machine.
# Returns non-zero (without setting DEVICE) on an unknown host.
case $(hostname) in
  levante*.dkrz.de|vader*.dkrz.de)
    export DEVICE=a100
    ;;
  nid*) # lumi-g
    export DEVICE=mi250x
    ;;
  *)
    echo "unknown system: $(hostname)" >&2
    # This file is sourced: a plain `exit` would terminate the sourcing shell
    # (e.g. an interactive login shell). `return` only leaves this file; the
    # `exit` fallback covers the case where the file is executed directly,
    # where `return` is not allowed.
    return 1 2>/dev/null || exit 1
    ;;
esac
echo "$0: DEVICE=$DEVICE"
......@@ -8,10 +8,15 @@ ulimit -s unlimited
set -e
# set DEVICE:
source conf.sh
export VERBOSE_LEVEL=2
if [ "$1" == 'gpu' ]
then
#rm -rf build_gpu
cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O0 -g"
cmake -B build_gpu -S . -DMU_ARCH=$DEVICE -DCMAKE_CXX_FLAGS="-O0 -g"
cmake --build build_gpu --parallel
ncells=(8)
......@@ -28,7 +33,7 @@ else
export OMP_PROC_BIND=close
export OMP_PLACES=cores
export OMP_NUM_THREADS=8
export OMP_NUM_THREADS=1
fi
for jb in ${ncells[*]}; do
......@@ -39,9 +44,11 @@ for jb in ${ncells[*]}; do
export NCELLS=$jb
if [ "$1" == 'gpu' ]
then
./build_gpu/demo
#./build_gpu/demo
./build_gpu/fdemo
else
./build/demo
#./build/demo
./build/fdemo
fi
echo "---"
done
......
! Support module for the Fortran driver.
!
! Provides the working precision, the domain descriptor exchanged with the
! C-bound routines, explicit interfaces for the f2c_* entry points, and
! OpenACC-annotated helpers that fill fields and apply the column operator.
MODULE fsupport
  USE, INTRINSIC :: iso_c_binding
#ifdef _OPENACC
  USE openacc
#endif
  IMPLICIT NONE
  PRIVATE

  PUBLIC :: dp
  PUBLIC :: t_domain
  PUBLIC :: get_domain
  PUBLIC :: init_field
  PUBLIC :: zero_field
  PUBLIC :: physop
  PUBLIC :: f2c_default_run_all
  !PUBLIC :: show_field
  PUBLIC :: f2c_kokkos_initialize
  PUBLIC :: f2c_kokkos_finalize

  ! Working precision: the kind that is interoperable with C double.
  INTEGER, PARAMETER :: dp = c_double

  ! Domain sizes as returned by f2c_get_domain. BIND(c) makes the component
  ! order and types interoperable with the structure on the C side.
  TYPE, BIND(c) :: t_domain
    INTEGER(c_int) :: ncells
    INTEGER(c_int) :: nlev
    INTEGER(c_int) :: nblocks
    INTEGER(c_int) :: nproma
  END TYPE t_domain

  INTERFACE

    SUBROUTINE f2c_kokkos_initialize() BIND(c)
    END SUBROUTINE f2c_kokkos_initialize

    SUBROUTINE f2c_kokkos_finalize() BIND(c)
    END SUBROUTINE f2c_kokkos_finalize

    SUBROUTINE f2c_get_domain(dom) BIND(c)
      IMPORT t_domain
      TYPE(t_domain) :: dom
    END SUBROUTINE f2c_get_domain

    ! The three sizes are passed by value as C int; declaring them with
    ! c_int keeps the interface interoperable even if the default INTEGER
    ! kind is changed by compiler options.
    SUBROUTINE f2c_default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma) BIND(c)
      IMPORT :: c_double, c_int
      REAL(c_double) :: field(*), ref_in(*), ref_out(*)
      INTEGER(c_int), VALUE :: nblocks, nlev, nproma
    END SUBROUTINE f2c_default_run_all

  END INTERFACE

CONTAINS

  ! Fill dom with the domain sizes provided by the C side.
  SUBROUTINE get_domain(dom)
    TYPE(t_domain), INTENT(out) :: dom
    CALL f2c_get_domain(dom)
  END SUBROUTINE get_domain

  ! Set every element of field to its zero-based column-major linear index,
  ! i.e. field(i,k,j) = (i-1) + (k-1)*ni + (j-1)*ni*nk.
  ! With OpenACC, DEFAULT(PRESENT) requires field to be present on the device.
  SUBROUTINE init_field(field)
    REAL(dp), INTENT(out) :: field(:,:,:)
    INTEGER :: j,k,i
    INTEGER :: nj,nk,ni

    nj = SIZE(field,3)
    nk = SIZE(field,2)
    ni = SIZE(field,1)
    !PRINT*,'ni,nk,nj=',ni,nk,nj
    DO j = 1, nj
      !$ACC parallel default(present)
      !$ACC LOOP GANG VECTOR COLLAPSE(2)
      DO k = 1, nk
        DO i = 1, ni
          field(i,k,j) = REAL(i-1 + (k-1)*ni + (j-1)*ni*nk, dp)
          !PRINT*,'i,k,j,val=',i,k,j, field(i,k,j)
        ENDDO
      ENDDO
      !$ACC END PARALLEL
    ENDDO
  END SUBROUTINE init_field

  ! Set every element of v to zero.
  ! With OpenACC, DEFAULT(PRESENT) requires v to be present on the device.
  SUBROUTINE zero_field(v)
    REAL(dp), INTENT(out) :: v(:,:,:)
    INTEGER :: j,k,i
    INTEGER :: nj,nk,ni

    nj = SIZE(v,3)
    nk = SIZE(v,2)
    ni = SIZE(v,1)
    DO j = 1, nj
      !$ACC parallel default(present)
      !$ACC LOOP GANG VECTOR COLLAPSE(2)
      DO k = 1, nk
        DO i = 1, ni
          v(i,k,j) = 0.0_dp
        ENDDO
      ENDDO
      !$ACC END PARALLEL
    ENDDO
  END SUBROUTINE zero_field

  ! Apply the in-place column operator to v, sweeping the second dimension
  ! from k = 1 to k = nk for every (i, j):
  !   k = 1       : v(k) = v(k) + v(k+1)/10000
  !   1 < k < nk  : v(k) = v(k) + v(k-1)/100 + v(k+1)/10000
  !   k = nk      : v(k) = v(k) + v(k-1)/100
  ! The sweep is sequential in k, so v(k-1) is the already updated value and
  ! v(k+1) the not yet updated one.
  !
  ! v is INTENT(inout): the routine reads the incoming values. (INTENT(out)
  ! would leave the dummy argument undefined on entry.)
  ! The first and last level access k+1 and k-1, so SIZE(v,2) must be >= 2.
  SUBROUTINE physop(v)
    REAL(dp), INTENT(inout) :: v(:,:,:)
    INTEGER :: j,k,i
    INTEGER :: nj,nk,ni

    nj = SIZE(v,3)
    nk = SIZE(v,2)
    ni = SIZE(v,1)
    PRINT*,'physop: ni,nk,nj=',ni,nk,nj
    DO j = 1, nj
      !$ACC PARALLEL DEFAULT(PRESENT)
      k = 1
      !$ACC LOOP GANG VECTOR
      DO i = 1, ni
        v(i,k,j) = v(i,k,j) + v(i,k+1,j)/10000
      ENDDO
      !$ACC LOOP SEQ
      DO k = 2, nk-1
        !$ACC LOOP GANG VECTOR
        DO i = 1, ni
          v(i,k,j) = v(i,k,j) + v(i,k-1,j)/100 + v(i,k+1,j)/10000;
        ENDDO
      ENDDO
      k = nk
      !$ACC LOOP GANG VECTOR
      DO i = 1, ni
        v(i,k,j) = v(i,k,j) + v(i,k-1,j)/100;
      ENDDO
      !$ACC END PARALLEL
    ENDDO
  END SUBROUTINE physop

#if 0
  ! Debug helper (disabled): print every element of f with its indices.
  ! The extents follow the f(i,k,j) indexing used below: ni is the extent of
  ! dimension 1 and nj the extent of dimension 3.
  ! NOTE(review): the bare F edit descriptor has no width and is not standard
  ! Fortran; adjust before re-enabling this routine.
  SUBROUTINE show_field(label,f)
    CHARACTER(len=*), INTENT(in) :: label
    REAL(dp), INTENT(in) :: f(:,:,:)
    INTEGER :: i,k,j
    INTEGER :: ni,nk,nj

    ni = SIZE(f,1)
    nk = SIZE(f,2)
    nj = SIZE(f,3)
    DO j = 1, nj
      !$ACC PARALLEL DEFAULT(PRESENT)
      !$ACC LOOP SEQ
      DO k = 1, nk
        DO i = 1, ni
          PRINT '(A,3I8,F)','show_field: '//label,j,k,i,f(i,k,j)
        ENDDO
      ENDDO
      !$ACC END PARALLEL
    ENDDO
  END SUBROUTINE show_field
#endif

END MODULE fsupport
! Fortran main program of the demo.
!
! Sequence: start Kokkos, query the domain sizes, allocate three fields of
! shape (nproma, nlev, nblocks), create them on the device, fill them, and
! hand them to the C-bound routine f2c_default_run_all.
PROGRAM fdriver
USE fsupport, ONLY: dp, t_domain, get_domain, init_field, zero_field, physop, &
& f2c_default_run_all, f2c_kokkos_initialize, f2c_kokkos_finalize
IMPLICIT NONE
TYPE(t_domain) :: dom
REAL(dp), ALLOCATABLE :: field(:,:,:), ref_in(:,:,:), ref_out(:,:,:)
! Kokkos is started before any other work and shut down last.
CALL f2c_kokkos_initialize()
PRINT*,'fdriver::start'
CALL get_domain(dom)
PRINT '(A,4I8)','ncells,lev,nblocks,nproma=',dom%ncells,dom%nlev,dom%nblocks,dom%nproma
ALLOCATE( field(dom%nproma,dom%nlev,dom%nblocks), &
& ref_in(dom%nproma,dom%nlev,dom%nblocks), &
& ref_out(dom%nproma,dom%nlev,dom%nblocks) )
! Create the device copies first: the helpers below use DEFAULT(PRESENT).
!$acc enter data create(field, ref_in, ref_out)
! field starts at zero; ref_in and ref_out get the same initial pattern and
! physop is then applied to ref_out only.
CALL zero_field(field)
CALL init_field(ref_in)
CALL init_field(ref_out)
CALL physop(ref_out)
! Inside host_data the three names refer to the device addresses, so the
! C routine receives device pointers when built with OpenACC.
!$acc host_data use_device(field, ref_in, ref_out)
CALL f2c_default_run_all(field, ref_in, ref_out, dom%nblocks, dom%nlev, dom%nproma)
!$acc end host_data
!$acc exit data copyout(field, ref_in, ref_out)
CALL f2c_kokkos_finalize()
PRINT*,'fdriver:end'
END PROGRAM fdriver
#!/bin/bash
# Slurm batch job "demo_cpu": runs ./script.sh without arguments.
# stdout and stderr go to the same file, demo_cpu.o<jobid>.
# NOTE(review): the partition is dev-g, the same one the GPU job uses;
# confirm that this is intended for the CPU run.
#SBATCH --job-name=demo_cpu
#SBATCH --output=demo_cpu.o%j
#SBATCH --error=demo_cpu.o%j
#SBATCH --partition=dev-g
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=8
#SBATCH --time=00:10:00
#SBATCH --account=project_465001141
# presumably makes the build step print full command lines -- TODO confirm
export VERBOSE=1
#./debug_script.sh
./script.sh
#!/bin/bash
# Slurm batch job "demo_gpu": runs ./script.sh with the argument "gpu" on one GPU.
# stdout and stderr go to the same file, demo_gpu.o<jobid>.
#SBATCH --job-name=demo_gpu # Job name
#SBATCH --output=demo_gpu.o%j # Name of stdout output file
#SBATCH --error=demo_gpu.o%j # Name of stderr error file
#SBATCH --partition=dev-g # partition name : standard-g
#SBATCH --gpus=1
#SBATCH --time=00:10:00 # Run time (d-hh:mm:ss)
#SBATCH --account=project_465001141 # Project for billing
# Record the working directory in the job output.
pwd
#./debug_script.sh gpu
./script.sh gpu
# source this
#
# Build/run environment based on the LUMI/24.03 software stack and the Cray
# programming environment. This variant loads cce/17.0.1 with the
# craype-x86-milan target and leaves the GPU accelerator module, the Python
# venv and the dynamic link type disabled.
module load LUMI/24.03
module load buildtools/24.03
module load googletest/1.14.0-cpeCray-24.03
module load PrgEnv-cray
# Prepend the Cray library path. The ":" separator is added only when
# LD_LIBRARY_PATH is already non-empty: an empty path element would make the
# dynamic loader search the current working directory as well.
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
# load GPU support
#module load craype-accel-amd-gfx90a
module load rocm
module load cce/17.0.1 craype-x86-milan
# new: python with jinja2
#source ~/venv/bin/activate
#export CRAYPE_LINK_TYPE=dynamic
# Compiler wrappers picked up by CMake.
export CXX=CC
export FC=ftn
# source this
#
# Build/run environment based on the LUMI/24.03 software stack and the Cray
# programming environment, with the gfx90a accelerator target and ROCm
# loaded, the Python venv activated and dynamic linking selected.
module load LUMI/24.03
module load buildtools/24.03
module load googletest/1.14.0-cpeCray-24.03
module load PrgEnv-cray
# Prepend the Cray library path. The ":" separator is added only when
# LD_LIBRARY_PATH is already non-empty: an empty path element would make the
# dynamic loader search the current working directory as well.
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH"
# load GPU support
module load craype-accel-amd-gfx90a
module load rocm
# new: python with jinja2
source ~/venv/bin/activate
export CRAYPE_LINK_TYPE=dynamic
# Compiler wrappers picked up by CMake.
export CXX=CC
export FC=ftn
#include <Kokkos_Core.hpp>
#include <functional>
#undef NDEBUG // always use assertion
#include <cassert>
#include <iostream>
#include <utility>
......@@ -13,15 +14,10 @@ using space_t = Kokkos::DefaultExecutionSpace::memory_space;
typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
Kokkos::Timer timer;
//Kokkos::Timer transposition_timer;
//Kokkos::Timer merge_horizontal_timer;
// constexpr int nblocks = 2;
// constexpr int nlev = 90;
// constexpr int nproma = 55000;
#define ENABLE_CHECK_BOUNDS
static int verbose_level = 0;
struct AllLevels {
constexpr static int value = 0;
......@@ -50,6 +46,27 @@ static void validate(double* array, const double *ref, int nblocks, int nlev, in
}
}
// Error of x1 with respect to the reference x0: relative to |x0| when |x0|
// exceeds eps (1e-14), otherwise the absolute difference.
inline static HOST_DEVICE_ATTRIBUTES double ferror(double x1, double x0) {
  constexpr double eps = 1.e-14;
  const double abs_ref = fabs(x0);
  const double abs_diff = fabs(x1-x0);
  return (abs_ref > eps) ? abs_diff/abs_ref : abs_diff;
}
// Compare array against ref element by element and assert that the largest
// ferror() value over all nblocks*nlev*nproma elements is below 1e-14.
//
// array, ref : flat buffers of nblocks*nlev*nproma doubles; they are
//              dereferenced inside a Kokkos kernel on the default execution
//              space, so they must be accessible from there
// The maximum is printed when verbose_level > 0.
static void validate_max_error(double* array, const double *ref, int nblocks, int nlev, int nproma) {
  const int ntot = nblocks*nlev*nproma;
  double totmax;
  Kokkos::parallel_reduce("validator", ntot, KOKKOS_LAMBDA(const int &i, double &emax) {
      double err = ferror(array[i],ref[i]);
      // A NaN error (NaN in array or ref) compares unequal to itself. fmax()
      // ignores NaN operands, so without this mapping a NaN result would pass
      // validation unnoticed; replace it by a huge finite error instead.
      if (err != err) err = 1.0e308;
      emax = fmax(emax, err);
    }, Kokkos::Max<double>(totmax));
  if (verbose_level > 0) printf("validate_max_error: total max error=%.10e\n",totmax);
  assert(totmax < 1.e-14);
}
KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int i2, int n0, int n1, int n2) {
#ifdef ENABLE_CHECK_BOUNDS
assert(i0 >= 0 && i1 >= 0 && i2 >= 0 && i0 < n0 && i1 < n1 && i2 < n2);
......@@ -62,37 +79,6 @@ KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int n0, int n1) {
#endif
}
template<typename ViewType>
void show_view(const std::string &label, const ViewType &view, const bool with_values=false) {
printf("show_view: label=%s, ",label.c_str());
using MyLayout = typename ViewType::array_layout;
if (std::is_same<MyLayout, Kokkos::LayoutLeft>::value) {
printf("Layout=LayoutLeft\n");
} else if (std::is_same<MyLayout, Kokkos::LayoutRight>::value) {
printf("Layout=LayoutRight\n");
} else {
printf("Layout: **unknown**\n");
}
if (with_values) {
#ifdef DEMO_DEVICE
auto h_view = Kokkos::create_mirror_view_and_copy(Kokkos::Serial(),view);
#else
auto &h_view = view;
#endif
const int n0 = h_view.extent(0);
const int n1 = h_view.extent(1);
const int n2 = h_view.extent(2);
printf("n0=%d, n1=%d, n2=%d\n",n0,n1,n2);
for (int i0=0; i0 < n0; ++i0) {
for (int i2=0; i2 < n2; ++i2) {
for (int i1=0; i1 < n1; ++i1) {
printf("show_view: i0=%d, i1=%d, i2=%d, v=%f\n",i0,i1,i2, h_view(i0,i1,i2));
}
};
};
}
Kokkos::fence();
}
template<typename ViewType>
inline HOST_DEVICE_ATTRIBUTES void update_column(ViewType &v) {
......@@ -125,7 +111,6 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
auto d_view = Kokkos::create_mirror_view_and_copy(space_t(), view);
timer.reset();
for (int jb = 0; jb < nblocks; ++jb)
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
......@@ -133,18 +118,20 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
});
Kokkos::fence();
Kokkos::fence();
if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
//validate(array, nblocks, nlev, nproma);
}
void scenario_1_noprint(double* array, int nblocks, int nlev, int nproma) {
printf("scenario_1_noprint\n");
scenario_1(array, nblocks, nlev, nproma, false);
}
void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1b: always LayoutRight; view(array, nblocks, nlev, nproma); parallel: nproma ----- "
<< std::endl;
<< std::endl;
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
timer.reset();
......@@ -170,6 +157,8 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
//Kokkos::View<double**, Kokkos::MemoryUnmanaged> d_view(array, ncells, nlev);
Kokkos::View<double**> d_view2d("d_view2d", ncells, nlev);
Kokkos::Timer tr_timer;
Kokkos::Timer total_timer;
total_timer.reset();
// view3d -> view2d:
tr_timer.reset();
......@@ -182,7 +171,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
});
});
Kokkos::fence();
printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
timer.reset();
Kokkos::parallel_for(
......@@ -192,7 +181,7 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
update_column(column);
});
Kokkos::fence();
printf("Time = %f ms\n", timer.seconds() * 1000);
printf("Kernel = %f ms\n", timer.seconds() * 1000);
// view2d -> view3d:
tr_timer.reset();
......@@ -205,7 +194,9 @@ void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
});
});
Kokkos::fence();
printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
}
......@@ -223,6 +214,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view3d(array, nproma, nlev, nblocks);
Kokkos::Timer tr_timer;
Kokkos::Timer total_timer;
total_timer.reset();
// view3d -> view2d:
tr_timer.reset();
......@@ -235,7 +228,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
});
});
Kokkos::fence();
printf("Time transposition 3d -> 2d= %f ms\n", tr_timer.seconds() * 1000);
printf("Transposition 3d -> 2d = %f ms\n", tr_timer.seconds() * 1000);
timer.reset();
......@@ -247,7 +240,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
});
Kokkos::fence();
printf("Time = %f ms\n", timer.seconds() * 1000);
printf("Kernel = %f ms\n", timer.seconds() * 1000);
// view2d -> view3d:
tr_timer.reset();
......@@ -260,7 +253,8 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
});
});
Kokkos::fence();
printf("Time transposition 2d -> 3d= %f ms\n\n", tr_timer.seconds() * 1000);
printf("Transposition 2d -> 3d = %f ms\n", tr_timer.seconds() * 1000);
printf("Time = %f ms\n\n", total_timer.seconds() * 1000);
}
......@@ -580,8 +574,7 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru
#endif
void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7b: +ACC modifiedDefault layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
std::cout << "scenario 7b: +ACC Default layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
#ifdef DEMO_DEVICE
Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#else
......@@ -606,14 +599,13 @@ void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
void scenario_7bmacro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) parallel: asICON----- " << std::endl;
std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) para: asICON" << std::endl;
#if defined(DEMO_DEVICE)
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#else
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
#endif
show_view("7bmacro",d_view);
timer.reset();
outer_for("", 0, nblocks, outer_lambda(const int jb){
inner_for("",0, nproma, inner_lambda(const int jc) {
......@@ -649,7 +641,6 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 8: +ACC always LayoutLeft view(array, nproma, nlev, nblocks) parallel: nproma ----- " << std::endl;
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks) ;
show_view("scenario_8", d_view);
timer.reset();
for (int jb = 0; jb < nblocks; ++jb)
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
......@@ -662,7 +653,7 @@ void scenario_8(double* array, int nblocks, int nlev, int nproma) {
void scenario_8macro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 8macro: +ACC always LayoutLeftm, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
std::cout << "scenario 8macro: +ACC always LayoutLeft, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
timer.reset();
......@@ -733,8 +724,47 @@ void physop(double *array, int nproma, int nlev, int nblocks) {
}
}
// Report whether ptr is a device pointer for the default execution space.
//
// When the default execution space is Kokkos::Cuda or Kokkos::HIP, the
// pointer attributes are queried from the runtime; the result is true when
// the reported device pointer is non-null and equal to ptr. In every other
// configuration the function returns false.
// The attribute query is asserted to succeed.
bool is_device_ptr(const void* ptr) {
using ExecSpace = Kokkos::DefaultExecutionSpace;
#ifdef KOKKOS_ENABLE_CUDA
constexpr bool exec_on_cuda = std::is_same<ExecSpace,Kokkos::Cuda>::value;
if constexpr (exec_on_cuda) {
cudaPointerAttributes att;
auto err = cudaPointerGetAttributes (&att, ptr);
assert(err == cudaSuccess);
// Diagnostic dump of the queried attributes at high verbosity.
if (verbose_level > 2 ) {
printf("is_device_ptr: device=%d\n",att.device);
printf("is_device_ptr: memtype=%d\n",att.type );
printf("is_device_ptr: ptr=%p\n",ptr);
printf("is_device_ptr: d_ptr=%p\n",att.devicePointer);
printf("is_device_ptr: h_ptr=%p\n",att.hostPointer);
}
if (att.devicePointer && ptr == att.devicePointer ) {
return true;
} else {
return false;
}
}
#endif
#ifdef KOKKOS_ENABLE_HIP
constexpr bool exec_on_hip = std::is_same<ExecSpace,Kokkos::HIP>::value;
if constexpr (exec_on_hip) {
hipPointerAttribute_t att;
auto err = hipPointerGetAttributes(&att, ptr);
assert(err == hipSuccess);
if (att.devicePointer && ptr == att.devicePointer ) {
return true;
} else {
return false;
}
}
#endif
// Neither GPU backend is the default execution space.
return false;
void run_scenario(double *array, const double*array_in_ref, const double *array_out_ref,
}
void run_scenario_using_host_data(double *array, const double *array_in_ref, const double *array_out_ref,
int nblocks, int nlev, int nproma,
const std::function<void(double*, int, int, int)> &scenario) {
const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
......@@ -742,17 +772,113 @@ void run_scenario(double *array, const double*array_in_ref, const double *array_
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, scenario);
}
int main() {
int64_t ncells64 = atoi(std::getenv("NCELLS"));
assert(ncells64 < INT_MAX);
int64_t nproma64 = atoi(std::getenv("NPROMA"));
assert(nproma64 < INT_MAX);
int ncells = ncells64;
int nlev = atoi(std::getenv("NLEV"));
int nproma = nproma64;
void default_run_scenario(double *array, const double *array_in_ref, const double *array_out_ref,
int nblocks, int nlev, int nproma,
const std::function<void(double*, int, int, int)> &scenario) {
// init:
Kokkos::View<double*, Kokkos::MemoryUnmanaged> view(array, nblocks*nlev*nproma);
Kokkos::View<const double*, Kokkos::MemoryUnmanaged> view_in_ref(array_in_ref, nblocks*nlev*nproma);
Kokkos::deep_copy(view, view_in_ref);
// run:
scenario(view.data(), nblocks, nlev, nproma);
// check:
validate_max_error(array, array_out_ref, nblocks, nlev, nproma);
}
// Domain sizes, filled by get_domain().
// f2c_get_domain() writes this struct through a pointer supplied by the
// caller, so member order and types must stay in sync with the caller's
// declaration of the structure.
struct Domain {
int ncells;   // value of NCELLS
int nlev;     // value of NLEV
int nblocks;  // derived in get_domain(): nproma*nblocks == ncells
int nproma;   // value of NPROMA
};
// Read an integer setting from the environment.
//
// str           : name of the environment variable
// default_value : returned (and reported on stdout) when the variable is
//                 unset or does not start with a number
// returns       : the parsed value, or default_value as described above
//
// A value outside the range of int triggers an assertion failure.
int get_int(const char* str, const int default_value) {
  const char* istr = std::getenv(str);
  if (not istr) {
    printf("get_int: key=%s, using default=%d\n",str, default_value);
    return default_value;
  }
  // Parse into a 64-bit value so that the range check below is meaningful.
  // atoi() returns int (the check could never fail) and has undefined
  // behaviour on overflow; strtoll() saturates at LLONG_MIN/LLONG_MAX, which
  // the range check rejects.
  char* end = nullptr;
  const long long i64 = std::strtoll(istr, &end, 10);
  if (end == istr) {
    // No digits were consumed (empty or non-numeric value).
    printf("get_int: key=%s, value '%s' is not a number, using default=%d\n",
           str, istr, default_value);
    return default_value;
  }
  assert(i64 >= INT_MIN and i64 <= INT_MAX);
  return static_cast<int>(i64);
}
// Build the domain description from the environment variables NCELLS,
// NPROMA and NLEV (each read via get_int with default 0).
//
// nblocks is the number of blocks of size nproma needed to cover ncells.
// Asserts that NPROMA is positive and that nproma*nblocks == ncells, i.e.
// that NCELLS is a multiple of NPROMA.
Domain get_domain() {
  const int ncells = get_int("NCELLS", 0);
  const int nproma = get_int("NPROMA", 0);
  const int nlev   = get_int("NLEV", 0);

  // nproma is used as a divisor below; an unset NPROMA yields the default 0.
  if (nproma <= 0) {
    fprintf(stderr, "get_domain: NPROMA must be a positive integer, got %d\n", nproma);
  }
  assert(nproma > 0);

  const int nblocks = (ncells - 1) / nproma + 1;
  assert(nproma*nblocks == ncells);

  Domain dom;
  dom.ncells = ncells;
  dom.nlev = nlev;
  dom.nproma = nproma;
  dom.nblocks = nblocks;
  return dom;
}
// C-linkage entry point: store the domain description in *dom.
// get_domain() is evaluated exactly once, so the environment is parsed and
// any get_int() messages are printed only once per call.
extern "C"
void f2c_get_domain(Domain *dom) {
  assert(dom != nullptr);
  *dom = get_domain();
}
// Run the default list of scenarios on `field`, validating each against ref_out.
//
// field   : work buffer, reset from ref_in before every scenario
// ref_in  : initial data
// ref_out : expected result
// All three pointers must agree on is_device_ptr() (asserted below).
void default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
// Refresh the verbosity from the environment on every call.
verbose_level=get_int("VERBOSE_LEVEL", 0);
if (verbose_level > 0) printf("default_run_all: start\n");
assert(is_device_ptr(field) == is_device_ptr(ref_in));
assert(is_device_ptr(field) == is_device_ptr(ref_out));
// Scenarios are executed in the order listed here.
std::vector funs = {
scenario_1_noprint,
scenario_1b, scenario_1c, scenario_1cmacro,
scenario_6b,
scenario_7bmacro, scenario_7btemplate,
scenario_8, scenario_8macro, scenario_8template
};
// scenario_7b is appended last, and only when it is not skipped.
if (exec_on_device() and (nproma > 50000)) {
printf("skip scenario_7b: GPU parallel over blocks way too slow\n\n");
} else {
funs.push_back(scenario_7b);
}
for (const auto &f : funs) {
default_run_scenario(field, ref_in, ref_out,
nblocks, nlev, nproma,
f);
printf("====\n");
}
if (verbose_level > 0) printf("default_run_all: end\n");
}
// C-linkage wrapper around default_run_all(); arguments are forwarded unchanged.
extern "C"
void f2c_default_run_all(double* field, double* ref_in, double* ref_out, int nblocks, int nlev, int nproma) {
default_run_all(field, ref_in, ref_out, nblocks, nlev, nproma);
}
// C-linkage wrapper: initialize Kokkos with default settings.
extern "C" void f2c_kokkos_initialize() {
Kokkos::initialize();
}
// C-linkage wrapper: finalize Kokkos.
extern "C" void f2c_kokkos_finalize() {
Kokkos::finalize();
}
void cpp_run_tests() {
const Domain dom = get_domain();
int nlev = dom.nlev;
int nproma = dom.nproma;
int nblocks = dom.nblocks;
verbose_level=get_int("VERBOSE_LEVEL", 0);
std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
......@@ -760,7 +886,6 @@ int main() {
double array_in_ref[nblocks * nlev * nproma];
double array[nblocks * nlev * nproma];
const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
Kokkos::initialize();
{
init_array(array_in_ref, nproma,nlev,nblocks);
......@@ -774,9 +899,9 @@ int main() {
//memcpy(array, array_in_ref, nbytes);
//scenario_1(array, nblocks, nlev, nproma);
//validate(array, array_out_ref, nblocks, nlev, nproma);
run_scenario(array, array_in_ref, array_out_ref,
nblocks, nlev, nproma,
scenario_6b);
run_scenario_using_host_data(array, array_in_ref, array_out_ref,
nblocks, nlev, nproma,
scenario_6b);
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_1b = scenario_1b;
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1b);
......@@ -834,7 +959,6 @@ int main() {
*/
}
Kokkos::finalize();
return 0;
}
......@@ -4,14 +4,20 @@
#nvhpc/24.7-gcc-11.2.0
#export LD_LIBRARY_PATH
ulimit -s unlimited
set -e
# set DEVICE:
source conf.sh
export VERBOSE_LEVEL=2
if [ "$1" == 'gpu' ]
then
#rm -rf build_gpu
cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
cmake -B build_gpu -S . -DMU_ARCH=$DEVICE -DCMAKE_CXX_FLAGS="-O3"
cmake --build build_gpu --parallel
ncells=(5000064)
......@@ -20,7 +26,7 @@ then
#nproma=$((449*29))
else
#rm -rf build
cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3" -DCMAKE_Fortran_FLAGS="-O3 -fopenmp"
cmake --build build --parallel
ncells=(5000064)
......@@ -30,6 +36,7 @@ else
export OMP_PROC_BIND=close
export OMP_PLACES=cores
export OMP_NUM_THREADS=8
echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}"
fi
for jb in ${ncells[*]}; do
......@@ -40,10 +47,12 @@ for jb in ${ncells[*]}; do
export NCELLS=$jb
if [ "$1" == 'gpu' ]
then
./build_gpu/demo
#./build_gpu/demo
./build_gpu/fdemo
else
./build/demo
echo "---"
#./build/demo
./build/fdemo
echo "--------------------------------------------------------"
fi
done
done
......