Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • k202174/demo
1 result
Show changes
Commits on Source (2)
#!/bin/bash
#gcc
#nvhpc/24.7-gcc-11.2.0
#export LD_LIBRARY_PATH
ulimit -s unlimited
set -e
if [ "$1" == 'gpu' ]
then
#rm -rf build_gpu
cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O0 -g"
cmake --build build_gpu --parallel
ncells=(8)
nlev=(3)
nproma=(2) # more than one block to check correctness
else
#rm -rf build
cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O0 -g"
cmake --build build --parallel
ncells=(8)
nlev=(3)
nproma=(2) #(32 64 96 128)
export OMP_PROC_BIND=close
export OMP_PLACES=cores
export OMP_NUM_THREADS=8
fi
for jb in ${ncells[*]}; do
for jk in ${nlev[*]}; do
for jc in ${nproma[*]}; do
export NPROMA=$jc
export NLEV=$jk
export NCELLS=$jb
if [ "$1" == 'gpu' ]
then
./build_gpu/demo
else
./build/demo
fi
echo "---"
done
done
done
......@@ -3,6 +3,9 @@
#include <Kokkos_Core.hpp>
#define outer_lambda [=]
#define inner_lambda KOKKOS_LAMBDA
namespace icon_loops {
constexpr bool exec_on_device() {
......
......@@ -3,6 +3,7 @@
#include <cassert>
#include <iostream>
#include <utility>
#include <cstring>
#include "Kokkos_Timer.hpp"
#include "icon_loops.hpp"
......@@ -12,32 +13,96 @@ using space_t = Kokkos::DefaultExecutionSpace::memory_space;
typedef Kokkos::MDRangePolicy<Kokkos::DefaultExecutionSpace, Kokkos::IndexType<int>, Kokkos::Rank<3>> md_range_policy;
Kokkos::Timer timer;
//Kokkos::Timer transposition_timer;
//Kokkos::Timer merge_horizontal_timer;
// constexpr int nblocks = 2;
// constexpr int nlev = 90;
// constexpr int nproma = 55000;
//#define ENABLE_CHECK_BOUNDS
//#define ENABLE_CHECK_BOUNDS_2D
#define ENABLE_CHECK_BOUNDS
static void validate(double* array, int nblocks, int nlev, int nproma) {
struct AllLevels {
constexpr static int value = 0;
};
constexpr AllLevels all_levels;
#if defined(DEMO_DEVICE)
# define HOST_DEVICE_ATTRIBUTES __host__ __device__
#else
# define HOST_DEVICE_ATTRIBUTES
#endif
//macro approach:
#ifdef DEMO_DEVICE
# define outer_for(STR, START, END, LAMBDA) for(int jb=START; jb<END; ++jb) {LAMBDA(jb);};
# define inner_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
#else
# define outer_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
# define inner_for(STR, START, END, LAMBDA) for(int jc=START; jc<END; ++jc) {LAMBDA(jc);};
#endif
static void validate(double* array, double *ref, int nblocks, int nlev, int nproma) {
for (int i = 0; i < nblocks * nlev * nproma; ++i) {
assert(array[i] == static_cast<double>(i));
assert(array[i] == ref[i]);
}
}
KOKKOS_INLINE_FUNCTION void check_bounds(int i1, int i2, int i3, int n1, int n2, int n3) {
KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int i2, int n0, int n1, int n2) {
#ifdef ENABLE_CHECK_BOUNDS
assert(i1 >= 0 && i2 >= 0 && i3 >= 0 && i1 < n1 && i2 < n2 && i3 < n3);
assert(i0 >= 0 && i1 >= 0 && i2 >= 0 && i0 < n0 && i1 < n1 && i2 < n2);
#endif
}
KOKKOS_INLINE_FUNCTION void check_bounds_2d(int i1, int i2, int n1, int n2) {
#ifdef ENABLE_CHECK_BOUNDS_2D
assert(i1 >= 0 && i2 >= 0 && i1 < n1 && i2 < n2 );
KOKKOS_INLINE_FUNCTION void check_bounds(int i0, int i1, int n0, int n1) {
#ifdef ENABLE_CHECK_BOUNDS
assert(i0 >= 0 && i1 >= 0 && i0 < n0 && i1 < n1 );
#endif
}
template<typename ViewType>
void show_view(const std::string &label, const ViewType &view, const bool with_values=false) {
printf("show_view: label=%s, ",label.c_str());
using MyLayout = typename ViewType::array_layout;
if (std::is_same<MyLayout, Kokkos::LayoutLeft>::value) {
printf("Layout=LayoutLeft\n");
} else if (std::is_same<MyLayout, Kokkos::LayoutRight>::value) {
printf("Layout=LayoutRight\n");
} else {
printf("Layout: **unknown**\n");
}
if (with_values) {
#ifdef DEMO_DEVICE
auto h_view = Kokkos::create_mirror_view_and_copy(Kokkos::Serial(),view);
#else
auto &h_view = view;
#endif
const int n0 = h_view.extent(0);
const int n1 = h_view.extent(1);
const int n2 = h_view.extent(2);
printf("n0=%d, n1=%d, n2=%d\n",n0,n1,n2);
for (int i0=0; i0 < n0; ++i0) {
for (int i2=0; i2 < n2; ++i2) {
for (int i1=0; i1 < n1; ++i1) {
printf("show_view: i0=%d, i1=%d, i2=%d, v=%f\n",i0,i1,i2, h_view(i0,i1,i2));
}
};
};
}
Kokkos::fence();
}
template<typename ViewType>
inline HOST_DEVICE_ATTRIBUTES void update_field(ViewType &v, int i0, AllLevels all, int i2) {
const int nlev = v.extent(1);
check_bounds(i0, i2, v.extent(0), v.extent(2));
for (int k=1; k < nlev-1; ++k) {
v(i0,k,i2) = v(i0,k,i2) + v(i0,k-1,i2)/100 + v(i0,k+1,i2)/10000;
}
}
void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
......@@ -53,11 +118,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
for (int jb = 0; jb < nblocks; ++jb)
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
d_view(jb, jk, jc) = p;
}
update_field(d_view, jb, all_levels, jc);
});
Kokkos::fence();
......@@ -65,11 +126,12 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
//validate(array, nblocks, nlev, nproma);
}
void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1b: Default layout right; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
std::cout << "scenario 1b: always LayoutRight; view(array, nblocks, nlev, nproma); parallel: nproma ----- "
<< std::endl;
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
......@@ -79,11 +141,7 @@ void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
for (int jb = 0; jb < nblocks; ++jb)
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
check_bounds(jb, jk, jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
d_view(jb, jk, jc) = p;
}
update_field(d_view, jb, all_levels, jc);
});
Kokkos::fence();
......@@ -92,33 +150,30 @@ void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
}
#if 0
void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1c: Default layout; view(array, ncells, nlev); d_view(jc, jk) ----- "
<< std::endl;
int ncells = nblocks*nproma;
Kokkos::View<double**, Kokkos::MemoryUnmanaged> d_view(array, ncells, nlev);
Kokkos::View<double**> d_view2d("d_view2d", ncells, nlev);
timer.reset();
merge_horizontal(d_view2d, d_view);
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, ncells), KOKKOS_LAMBDA(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
#if defined(DEMO_DEVICE)
int p = jk * ncells + jc;
#else
int p = jc * nlev + jk;
#endif
check_bounds_2d(jc, jk, d_view.extent(0), d_view.extent(1));
d_view(jc, jk) = p;
check_bounds(jc, jk, d_view.extent(0), d_view.extent(1));
d_view2d(jc,jk) = d_view2d(jc,jk) + d_view2d(jc,jk-1)/100 + d_view2d(jc,jk+1)*0.25;
}
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
#endif
#if 0
void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1cmacro: +ACC CPU:LayoutLeft GPU:LayoutLeft; as ICON pointer view(array, ncells, nlev); d_view(jc, jk) ----- " << std::endl;
......@@ -146,7 +201,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
#endif
void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
......@@ -175,9 +230,11 @@ void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::fence();
if (print) printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
//validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
......@@ -208,7 +265,9 @@ void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print =
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
// slow on CPU
void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
......@@ -238,7 +297,9 @@ void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
......@@ -266,7 +327,9 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
......@@ -298,6 +361,7 @@ void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print =
Kokkos::deep_copy(view, d_view_tmp);
validate(array, nblocks, nlev, nproma);
}
#endif
#if defined(DEMO_DEVICE)
using Layout = Kokkos::LayoutLeft;
......@@ -327,7 +391,7 @@ template <class T> struct LoopFunctor {
}
};
#if 0
void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
......@@ -347,12 +411,14 @@ void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
int KOKKOS_FUNCTION get_indexes(int nproma) {
return nproma + 1 ;
}
#if 0
void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print) std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl;
......@@ -391,7 +457,9 @@ void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
......@@ -420,182 +488,126 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7b: +ACC Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
timer.reset();
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) {
//for (int jb = 0 ; jb < nblocks; ++jb) {
for (int jc = 0 ; jc < nproma; ++jc) {
for (int jk = 0; jk < nlev; ++jk) {
#if defined(DEMO_DEVICE)
int p = jc * nlev * nblocks + jk * nblocks + jb;
#else
int p = jb * nlev * nproma + jk * nproma + jc;
#endif
check_bounds(jb,jk,jc, d_view.extent(0), d_view.extent(1), d_view.extent(2));
d_view(jb, jk, jc) = p;
// printf("%f ", d_view(jb, jk, jc));
}
}
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
#define outer_lambda [=]
#define inner_lambda KOKKOS_LAMBDA
void scenario_7b(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7b: +ACC modifiedDefault layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- " << std::endl;
#ifdef DEMO_DEVICE
# define outer_for(STR, START, END, LAMBDA) for(int jb=START; jb<END; ++jb) {LAMBDA(jb);};
# define inner_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#else
# define outer_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
# define inner_for(STR, START, END, LAMBDA) for(int jc=START; jc<END; ++jc) {LAMBDA(jc);};
Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
#endif
timer.reset();
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nblocks), KOKKOS_LAMBDA (const int jb) {
for (int jc = 0 ; jc < nproma; ++jc) {
#ifdef DEMO_DEVICE
update_field(d_view, jc, all_levels, jb);
#else
update_field(d_view, jb, all_levels, jc);
#endif
}
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void scenario_7bmacro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutLeft; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
std::cout << "scenario 7bmacro: +ACC CPU:LayoutLeft GPU:LayoutRight; CPUview(array, nblocks, nlev, nproma); GPUview(nproma, nlev, nblocks) parallel: asICON----- " << std::endl;
#if defined(DEMO_DEVICE)
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#else
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
#endif
timer.reset();
outer_for("", 0, nblocks, outer_lambda(const int jb){
inner_for("",0, nproma, inner_lambda(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jb, jk, jc) = p;
}
});
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
show_view("7bmacro",d_view);
timer.reset();
outer_for("", 0, nblocks, outer_lambda(const int jb){
inner_for("",0, nproma, inner_lambda(const int jc) {
#ifdef DEMO_DEVICE
update_field(d_view, jc, all_levels, jb);
#else
update_field(d_view, jb, all_levels, jc);
#endif
});
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void scenario_7btemplate(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 7btemplate: +ACC CPU:LayoutLeft GPU:LayoutLeft; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- " << std::endl;
std::cout << "scenario 7btemplate: +ACC allways LayoutRight; view(array, nblocks, nlev, nproma); parallel: asICON ----- " << std::endl;
#if defined(DEMO_DEVICE)
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
#else
Kokkos::View<double***, Kokkos::LayoutRight, Kokkos::MemoryUnmanaged> d_view(array, nblocks, nlev, nproma);
#endif
timer.reset();
outer_loop(nblocks, outer_lambda(const int jb){
inner_loop(nproma, inner_lambda(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jb, jk, jc) = p;
}
});
});
timer.reset();
outer_loop(nblocks, outer_lambda(const int jb){
inner_loop(nproma, inner_lambda(const int jc) {
update_field(d_view, jb, all_levels, jc);
});
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void scenario_8(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 8: +ACC Default Layout as ICON pointer view(array, nproma, nlev, nblocks); d_view(jb, jk, jc) ----- " << std::endl;
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
Kokkos::View<double***, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
std::cout << "scenario 8: +ACC always LayoutLeft view(array, nproma, nlev, nblocks) parallel: nproma ----- " << std::endl;
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks) ;
show_view("scenario_8", d_view);
timer.reset();
for (int jb = 0; jb < nblocks; ++jb)
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
#if defined(DEMO_DEVICE)
int p = jb * nlev * nproma + jk * nproma + jc;
#else
int p = jc * nlev * nblocks + jk * nblocks + jb;
#endif
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}
Kokkos::parallel_for("", Kokkos::RangePolicy<>(0, nproma), KOKKOS_LAMBDA(const int jc) {
update_field(d_view, jc, all_levels, jb);
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void scenario_8macro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 8macro: +ACC CPU:LayoutLeft GPU:LayoutLeft as ICON pointer view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
std::cout << "scenario 8macro: +ACC always LayoutLeftm, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
#if defined(DEMO_DEVICE)
Kokkos::View<double***, Kokkos::LayoutLeft> d_view(array, nproma, nlev, nblocks);
#else
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#endif
timer.reset();
outer_for("", 0, nblocks, outer_lambda(const int jb) {
inner_for("",0, nproma, inner_lambda(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}
update_field(d_view, jc, all_levels, jb);
});
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void scenario_8template(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 8template: +ACC CPU:LayoutLeft GPU:LayoutLeft as ICON pointer view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- " << std::endl;
std::cout << "scenario 8template: +ACC always LayoutLeft, view(array, nproma, nlev, nblocks); parallel: asICON ----- " << std::endl;
using space_t = Kokkos::DefaultExecutionSpace::memory_space;
#if defined(DEMO_DEVICE)
Kokkos::View<double***, Kokkos::LayoutLeft> d_view(array, nproma, nlev, nblocks);
#else
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::MemoryUnmanaged> d_view(array, nproma, nlev, nblocks);
#endif
timer.reset();
outer_loop(nblocks, outer_lambda(const int jb) {
inner_loop(nproma, inner_lambda(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
int p = jb * nlev * nproma + jk * nproma + jc;
d_view(jc, jk, jb) = p;
// printf("%f ", d_view(jb, jk, jc));
}
update_field(d_view, jc, all_levels, jb);
});
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
void openacc_calls(double* array, int nblocks, int nlev, int nproma,
void openacc_calls(double* array, double *ref, int nblocks, int nlev, int nproma,
std::function<void(double*, int, int, int)> func){
#if defined(DEMO_DEVICE)
#if defined(DEMO_DEVICE)
#pragma acc enter data copyin(array[0:nblocks*nlev*nproma])
{
#pragma acc host_data use_device(array)
......@@ -604,64 +616,123 @@ void openacc_calls(double* array, int nblocks, int nlev, int nproma,
}
}
#pragma acc exit data copyout(array[0:nblocks*nlev*nproma])
#else
#else
func(array, nblocks, nlev, nproma);
#endif
validate(array, nblocks, nlev, nproma);
#endif
validate(array, ref, nblocks, nlev, nproma);
}
void init_array(double *array, int nproma, int nlev, int nblocks) {
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
for (int jb=0; jb < nblocks; ++jb) {
for (int jk=0; jk < nlev; ++jk) {
for (int jc=0; jc < nproma; ++jc) {
view(jc,jk,jb) = jc + jk*nproma + jb * nproma*nblocks;
}
}
}
}
void physop(double *array, int nproma, int nlev, int nblocks) {
Kokkos::View<double***, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryUnmanaged> view(array, nproma, nlev, nblocks);
for (int jb=0; jb < nblocks; ++jb) {
for (int jc=0; jc < nproma; ++jc) {
update_field(view, jc, all_levels, jb);
}
}
#if 0
for (int jb=0; jb < nblocks; ++jb) {
for (int jc=0; jc < nproma; ++jc) {
for (int jk=0; jk < nlev; ++jk) {
printf("physop: jc=%d, jk=%d, jk=%d, view=%f\n",jc,jk,jb,view(jc,jk,jb));
}
}
}
#endif
}
int main() {
int ncells = atoi(std::getenv("NCELLS"));
int64_t ncells64 = atoi(std::getenv("NCELLS"));
assert(ncells64 < INT_MAX);
int64_t nproma64 = atoi(std::getenv("NPROMA"));
assert(nproma64 < INT_MAX);
int ncells = ncells64;
int nlev = atoi(std::getenv("NLEV"));
int nproma = atoi(std::getenv("NPROMA"));
int nproma = nproma64;
int nblocks = (ncells - 1) / nproma + 1;
assert(nproma*nblocks == ncells);
std::cout << "nblocks=" << nblocks << ", nlev=" << nlev << ", nproma=" << nproma << std::endl;
double array_out_ref[nblocks * nlev * nproma];
double array_in_ref[nblocks * nlev * nproma];
double array[nblocks * nlev * nproma];
const size_t nbytes = nproma*nlev*nblocks*sizeof(double);
Kokkos::initialize();
{
init_array(array_in_ref, nproma,nlev,nblocks);
memcpy(array_out_ref, array_in_ref, nbytes);
physop(array_out_ref, nproma,nlev,nblocks);
memcpy(array, array_in_ref, nbytes);
scenario_1(array, nblocks, nlev, nproma, false);
//memcpy(array, array_in_ref, nbytes);
//scenario_1(array, nblocks, nlev, nproma);
memset(array, 0.0, sizeof(array));
//validate(array, array_out_ref, nblocks, nlev, nproma);
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_1b = scenario_1b;
openacc_calls(array, nblocks, nlev, nproma, s_1b);
memset(array, 0.0, sizeof(array));
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1b);
#if 0
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_1c = scenario_1c;
openacc_calls(array, nblocks, nlev, nproma, s_1c);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1c);
#endif
memset(array, 0.0, sizeof(array));
#if 0
std::function<void(double*, int, int, int)> s_1cmacro = scenario_1cmacro;
openacc_calls(array, nblocks, nlev, nproma, s_1cmacro);
memset(array, 0.0, sizeof(array));
#endif
std::function<void(double*, int, int, int)> s_7b = scenario_7b;
openacc_calls(array, nblocks, nlev, nproma, s_7b);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_7b);
memset(array, 0.0, sizeof(array));
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_7bmacro = scenario_7bmacro;
openacc_calls(array, nblocks, nlev, nproma, s_7bmacro);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_7bmacro);
memset(array, 0.0, sizeof(array));
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_7btemplate = scenario_7btemplate;
openacc_calls(array, nblocks, nlev, nproma, s_7btemplate);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_7btemplate);
memset(array, 0.0, sizeof(array));
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_8 = scenario_8;
openacc_calls(array, nblocks, nlev, nproma, s_8);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_8);
memset(array, 0.0, sizeof(array));
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_8macro = scenario_8macro;
openacc_calls(array, nblocks, nlev, nproma, s_8macro);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_8macro);
memset(array, 0.0, sizeof(array));
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_8template = scenario_8template;
openacc_calls(array, nblocks, nlev, nproma, s_8template);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_8template);
// exit(0);
/*
scenario_2(array, nblocks, nlev, nproma);
scenario_2b(array, nblocks, nlev, nproma);
......@@ -670,7 +741,9 @@ int main() {
scenario_4b(array, nblocks, nlev, nproma);
scenario_5(array, nblocks, nlev, nproma);
scenario_6(array, nblocks, nlev, nproma);
scenario_7(array, nblocks, nlev, nproma);
validate(array, array_out_ref, nblocks, nlev, nproma);
*/
}
......
......@@ -14,15 +14,15 @@ then
cmake -B build_gpu -S . -DMU_ARCH=a100 -DCMAKE_CXX_FLAGS="-O3"
cmake --build build_gpu --parallel
ncells=(5000000)
ncells=(5000064)
nlev=(90)
nproma=(5000000)
nproma=(5000064)
else
#rm -rf build
cmake -B build -S . -DMU_ARCH=x86_64 -DCMAKE_CXX_FLAGS="-O3"
cmake --build build --parallel
ncells=(5000000)
ncells=(5000064)
nlev=(90)
nproma=(32 64 96 128)
......@@ -42,6 +42,7 @@ for jb in ${ncells[*]}; do
./build_gpu/demo
else
./build/demo
echo "---"
fi
done
done
......
#source this
module load gcc nvhpc
LD_LIBRARY_PATH=/sw/spack-levante/gcc-11.2.0-bcn7mb/lib/gcc/x86_64-pc-linux-gnu/11.2.0/../../../../lib64/:$LD_LIBRARY_PATH