Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
demo
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Package registry
Model registry
Operate
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Georgiana Mania
demo
Compare revisions
2ee992ae92e2eb0bba4545b643e2c8e974860900 to 047f3330605919f45a91165fe3736cf8f871d2c2
Compare revisions
Changes are shown as if the
source
revision was being merged into the
target
revision.
Learn more about comparing revisions.
Source
k202174/demo
Select target project
No results found
047f3330605919f45a91165fe3736cf8f871d2c2
Select Git revision
Swap
Target
k202174/demo
Select target project
k202174/demo
1 result
2ee992ae92e2eb0bba4545b643e2c8e974860900
Select Git revision
Show changes
Only incoming changes from source
Include changes to target since source was created
Compare
Commits on Source (2)
Disable 1c use case (2d view no longer works)
· 60372a86
Panos Adamidis
authored
1 month ago
and
Joerg Behrens
committed
1 month ago
60372a86
cleanup and debug support + better test
· 047f3330
Joerg Behrens
authored
1 month ago
047f3330
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
debug_script.sh
+49
-0
49 additions, 0 deletions
debug_script.sh
icon_loops.hpp
+3
-0
3 additions, 0 deletions
icon_loops.hpp
main.cpp
+240
-167
240 additions, 167 deletions
main.cpp
script.sh
+4
-3
4 additions, 3 deletions
script.sh
set_env.sh
+5
-0
5 additions, 0 deletions
set_env.sh
with
301 additions
and
170 deletions
debug_script.sh
0 → 100755
View file @
047f3330
#!/bin/bash
#gcc
#nvhpc/24.7-gcc-11.2.0
#export LD_LIBRARY_PATH
ulimit
-s
unlimited
set
-e
if
[
"
$1
"
==
'gpu'
]
then
#rm -rf build_gpu
cmake
-B
build_gpu
-S
.
-DMU_ARCH
=
a100
-DCMAKE_CXX_FLAGS
=
"-O0 -g"
cmake
--build
build_gpu
--parallel
ncells
=(
8
)
nlev
=(
3
)
nproma
=(
2
)
# more than one block to check correctness
else
#rm -rf build
cmake
-B
build
-S
.
-DMU_ARCH
=
x86_64
-DCMAKE_CXX_FLAGS
=
"-O0 -g"
cmake
--build
build
--parallel
ncells
=(
8
)
nlev
=(
3
)
nproma
=(
2
)
#(32 64 96 128)
export
OMP_PROC_BIND
=
close
export
OMP_PLACES
=
cores
export
OMP_NUM_THREADS
=
8
fi
for
jb
in
${
ncells
[*]
}
;
do
for
jk
in
${
nlev
[*]
}
;
do
for
jc
in
${
nproma
[*]
}
;
do
export
NPROMA
=
$jc
export
NLEV
=
$jk
export
NCELLS
=
$jb
if
[
"
$1
"
==
'gpu'
]
then
./build_gpu/demo
else
./build/demo
fi
echo
"---"
done
done
done
This diff is collapsed.
Click to expand it.
icon_loops.hpp
View file @
047f3330
...
...
@@ -3,6 +3,9 @@
#include
<Kokkos_Core.hpp>
#define outer_lambda [=]
#define inner_lambda KOKKOS_LAMBDA
namespace
icon_loops
{
constexpr
bool
exec_on_device
()
{
...
...
This diff is collapsed.
Click to expand it.
main.cpp
View file @
047f3330
...
...
@@ -3,6 +3,7 @@
#include
<cassert>
#include
<iostream>
#include
<utility>
#include
<cstring>
#include
"Kokkos_Timer.hpp"
#include
"icon_loops.hpp"
...
...
@@ -12,32 +13,96 @@ using space_t = Kokkos::DefaultExecutionSpace::memory_space;
typedef
Kokkos
::
MDRangePolicy
<
Kokkos
::
DefaultExecutionSpace
,
Kokkos
::
IndexType
<
int
>
,
Kokkos
::
Rank
<
3
>>
md_range_policy
;
Kokkos
::
Timer
timer
;
//Kokkos::Timer transposition_timer;
//Kokkos::Timer merge_horizontal_timer;
// constexpr int nblocks = 2;
// constexpr int nlev = 90;
// constexpr int nproma = 55000;
//#define ENABLE_CHECK_BOUNDS
//#define ENABLE_CHECK_BOUNDS_2D
#define ENABLE_CHECK_BOUNDS
static
void
validate
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
struct
AllLevels
{
constexpr
static
int
value
=
0
;
};
constexpr
AllLevels
all_levels
;
#if defined(DEMO_DEVICE)
# define HOST_DEVICE_ATTRIBUTES __host__ __device__
#else
# define HOST_DEVICE_ATTRIBUTES
#endif
//macro approach:
#ifdef DEMO_DEVICE
# define outer_for(STR, START, END, LAMBDA) for(int jb=START; jb<END; ++jb) {LAMBDA(jb);};
# define inner_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
#else
# define outer_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
# define inner_for(STR, START, END, LAMBDA) for(int jc=START; jc<END; ++jc) {LAMBDA(jc);};
#endif
static
void
validate
(
double
*
array
,
double
*
ref
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
for
(
int
i
=
0
;
i
<
nblocks
*
nlev
*
nproma
;
++
i
)
{
assert
(
array
[
i
]
==
static_cast
<
double
>
(
i
)
);
assert
(
array
[
i
]
==
ref
[
i
]
);
}
}
KOKKOS_INLINE_FUNCTION
void
check_bounds
(
int
i1
,
int
i2
,
int
i3
,
int
n1
,
int
n2
,
int
n3
)
{
KOKKOS_INLINE_FUNCTION
void
check_bounds
(
int
i0
,
int
i1
,
int
i2
,
int
n0
,
int
n1
,
int
n2
)
{
#ifdef ENABLE_CHECK_BOUNDS
assert
(
i1
>=
0
&&
i2
>=
0
&&
i
3
>=
0
&&
i1
<
n
1
&&
i
2
<
n
2
&&
i
3
<
n
3
);
assert
(
i0
>=
0
&&
i1
>=
0
&&
i2
>=
0
&&
i
0
<
n
0
&&
i
1
<
n
1
&&
i
2
<
n
2
);
#endif
}
KOKKOS_INLINE_FUNCTION
void
check_bounds
_2d
(
int
i
1
,
int
i
2
,
int
n
1
,
int
n
2
)
{
#ifdef ENABLE_CHECK_BOUNDS
_2D
assert
(
i
1
>=
0
&&
i
2
>=
0
&&
i
1
<
n
1
&&
i
2
<
n
2
);
KOKKOS_INLINE_FUNCTION
void
check_bounds
(
int
i
0
,
int
i
1
,
int
n
0
,
int
n
1
)
{
#ifdef ENABLE_CHECK_BOUNDS
assert
(
i
0
>=
0
&&
i
1
>=
0
&&
i
0
<
n
0
&&
i
1
<
n
1
);
#endif
}
template
<
typename
ViewType
>
void
show_view
(
const
std
::
string
&
label
,
const
ViewType
&
view
,
const
bool
with_values
=
false
)
{
printf
(
"show_view: label=%s, "
,
label
.
c_str
());
using
MyLayout
=
typename
ViewType
::
array_layout
;
if
(
std
::
is_same
<
MyLayout
,
Kokkos
::
LayoutLeft
>::
value
)
{
printf
(
"Layout=LayoutLeft
\n
"
);
}
else
if
(
std
::
is_same
<
MyLayout
,
Kokkos
::
LayoutRight
>::
value
)
{
printf
(
"Layout=LayoutRight
\n
"
);
}
else
{
printf
(
"Layout: **unknown**
\n
"
);
}
if
(
with_values
)
{
#ifdef DEMO_DEVICE
auto
h_view
=
Kokkos
::
create_mirror_view_and_copy
(
Kokkos
::
Serial
(),
view
);
#else
auto
&
h_view
=
view
;
#endif
const
int
n0
=
h_view
.
extent
(
0
);
const
int
n1
=
h_view
.
extent
(
1
);
const
int
n2
=
h_view
.
extent
(
2
);
printf
(
"n0=%d, n1=%d, n2=%d
\n
"
,
n0
,
n1
,
n2
);
for
(
int
i0
=
0
;
i0
<
n0
;
++
i0
)
{
for
(
int
i2
=
0
;
i2
<
n2
;
++
i2
)
{
for
(
int
i1
=
0
;
i1
<
n1
;
++
i1
)
{
printf
(
"show_view: i0=%d, i1=%d, i2=%d, v=%f
\n
"
,
i0
,
i1
,
i2
,
h_view
(
i0
,
i1
,
i2
));
}
};
};
}
Kokkos
::
fence
();
}
template
<
typename
ViewType
>
inline
HOST_DEVICE_ATTRIBUTES
void
update_field
(
ViewType
&
v
,
int
i0
,
AllLevels
all
,
int
i2
)
{
const
int
nlev
=
v
.
extent
(
1
);
check_bounds
(
i0
,
i2
,
v
.
extent
(
0
),
v
.
extent
(
2
));
for
(
int
k
=
1
;
k
<
nlev
-
1
;
++
k
)
{
v
(
i0
,
k
,
i2
)
=
v
(
i0
,
k
,
i2
)
+
v
(
i0
,
k
-
1
,
i2
)
/
100
+
v
(
i0
,
k
+
1
,
i2
)
/
10000
;
}
}
void
scenario_1
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
,
bool
print
=
true
)
{
if
(
print
)
std
::
cout
<<
"scenario 1: Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
...
...
@@ -53,11 +118,7 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
for
(
int
jb
=
0
;
jb
<
nblocks
;
++
jb
)
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nproma
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
check_bounds
(
jb
,
jk
,
jc
,
d_view
.
extent
(
0
),
d_view
.
extent
(
1
),
d_view
.
extent
(
2
));
d_view
(
jb
,
jk
,
jc
)
=
p
;
}
update_field
(
d_view
,
jb
,
all_levels
,
jc
);
});
Kokkos
::
fence
();
...
...
@@ -65,11 +126,12 @@ void scenario_1(double* array, int nblocks, int nlev, int nproma, bool print = t
if
(
print
)
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
Kokkos
::
deep_copy
(
view
,
d_view
);
validate
(
array
,
nblocks
,
nlev
,
nproma
);
//
validate(array, nblocks, nlev, nproma);
}
void
scenario_1b
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 1b:
Default l
ayout
r
ight; view(array, nblocks, nlev, nproma);
d_view(jb, jk, jc)
----- "
std
::
cout
<<
"scenario 1b:
always L
ayout
R
ight; view(array, nblocks, nlev, nproma);
parallel: nproma
----- "
<<
std
::
endl
;
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutRight
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
...
...
@@ -79,11 +141,7 @@ void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
for
(
int
jb
=
0
;
jb
<
nblocks
;
++
jb
)
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nproma
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
check_bounds
(
jb
,
jk
,
jc
,
d_view
.
extent
(
0
),
d_view
.
extent
(
1
),
d_view
.
extent
(
2
));
d_view
(
jb
,
jk
,
jc
)
=
p
;
}
update_field
(
d_view
,
jb
,
all_levels
,
jc
);
});
Kokkos
::
fence
();
...
...
@@ -92,33 +150,30 @@ void scenario_1b(double* array, int nblocks, int nlev, int nproma) {
}
#if 0
void scenario_1c(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1c: Default layout; view(array, ncells, nlev); d_view(jc, jk) ----- "
<< std::endl;
int ncells = nblocks*nproma;
Kokkos::View<double**, Kokkos::MemoryUnmanaged> d_view(array, ncells, nlev);
Kokkos::View<double**> d_view2d("d_view2d", ncells, nlev);
timer.reset();
merge_horizontal(d_view2d, d_view);
Kokkos::parallel_for(
"", Kokkos::RangePolicy<>(0, ncells), KOKKOS_LAMBDA(const int jc) {
for (int jk = 0; jk < nlev; ++jk) {
#if defined(DEMO_DEVICE)
int
p
=
jk
*
ncells
+
jc
;
#else
int
p
=
jc
*
nlev
+
jk
;
#endif
check_bounds_2d
(
jc
,
jk
,
d_view
.
extent
(
0
),
d_view
.
extent
(
1
));
d_view
(
jc
,
jk
)
=
p
;
check_bounds(jc, jk, d_view.extent(0), d_view.extent(1));
d_view2d(jc,jk) = d_view2d(jc,jk) + d_view2d(jc,jk-1)/100 + d_view2d(jc,jk+1)*0.25;
}
});
Kokkos::fence();
printf("Time = %f ms\n\n", timer.seconds() * 1000);
}
#endif
#if 0
void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
std::cout << "scenario 1cmacro: +ACC CPU:LayoutLeft GPU:LayoutLeft; as ICON pointer view(array, ncells, nlev); d_view(jc, jk) ----- " << std::endl;
...
...
@@ -146,7 +201,7 @@ void scenario_1cmacro(double* array, int nblocks, int nlev, int nproma) {
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
#endif
void
scenario_2
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
,
bool
print
=
true
)
{
if
(
print
)
...
...
@@ -175,9 +230,11 @@ void scenario_2(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos
::
fence
();
if
(
print
)
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
Kokkos
::
deep_copy
(
view
,
d_view
);
validate
(
array
,
nblocks
,
nlev
,
nproma
);
//
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 2b: Right 2b layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
...
...
@@ -208,7 +265,9 @@ void scenario_2b(double* array, int nblocks, int nlev, int nproma, bool print =
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
// slow on CPU
void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
...
...
@@ -238,7 +297,9 @@ void scenario_3(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 4: Default layout; view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
...
...
@@ -266,7 +327,9 @@ void scenario_4(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
#if 0
void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print)
std::cout << "scenario 4b (Dmitry's solution): view(array, nproma, nlev, nblocks); d_view(jc, jk, jb) ----- "
...
...
@@ -298,6 +361,7 @@ void scenario_4b(double* array, int nblocks, int nlev, int nproma, bool print =
Kokkos::deep_copy(view, d_view_tmp);
validate(array, nblocks, nlev, nproma);
}
#endif
#if defined(DEMO_DEVICE)
using
Layout
=
Kokkos
::
LayoutLeft
;
...
...
@@ -327,7 +391,7 @@ template <class T> struct LoopFunctor {
}
};
#if 0
void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print) std::cout << "scenario 5: Adaptable layout & functor & subview (array, nproma, nlev);" << std::endl;
...
...
@@ -347,12 +411,14 @@ void scenario_5(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos::deep_copy(view, d_view);
validate(array, nblocks, nlev, nproma);
}
#endif
int
KOKKOS_FUNCTION
get_indexes
(
int
nproma
)
{
return
nproma
+
1
;
}
#if 0
void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = true) {
if (print) std::cout << "scenario 6: Adaptable Layout & Hierarchical parallelism" << std::endl;
...
...
@@ -391,7 +457,9 @@ void scenario_6(double* array, int nblocks, int nlev, int nproma, bool print = t
Kokkos
::
deep_copy
(
view
,
d_view
);
validate
(
array
,
nblocks
,
nlev
,
nproma
);
}
#endif
#if 0
void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=true) {
if(print)
...
...
@@ -420,182 +488,126 @@ void scenario_7(double* array, int nblocks, int nlev, int nproma, bool print=tru
if(print)
printf("Time = %f ms\n\n", timer.seconds() * 1000);
Kokkos::deep_copy(view, d_view);
validate
(
array
,
nblocks
,
nlev
,
nproma
);
}
void
scenario_7b
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 7b: +ACC Default layout; view(array, nblocks, nlev, nproma); d_view(jb, jk, jc) ----- "
<<
std
::
endl
;
Kokkos
::
View
<
double
***
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
timer
.
reset
();
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nblocks
),
KOKKOS_LAMBDA
(
const
int
jb
)
{
//for (int jb = 0 ; jb < nblocks; ++jb) {
for
(
int
jc
=
0
;
jc
<
nproma
;
++
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
#if defined(DEMO_DEVICE)
int
p
=
jc
*
nlev
*
nblocks
+
jk
*
nblocks
+
jb
;
#else
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
#endif
check_bounds
(
jb
,
jk
,
jc
,
d_view
.
extent
(
0
),
d_view
.
extent
(
1
),
d_view
.
extent
(
2
));
d_view
(
jb
,
jk
,
jc
)
=
p
;
// printf("%f ", d_view(jb, jk, jc));
}
}
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
#define outer_lambda [=]
#define inner_lambda KOKKOS_LAMBDA
void
scenario_7b
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 7b: +ACC modifiedDefault layout; CPUview(array, blocks, nlev, nproma); GPUview(nproma, nlev, nblocks), parallel blocks ----- "
<<
std
::
endl
;
#ifdef DEMO_DEVICE
# define outer_for(STR, START, END, LAMBDA) for(int jb=START; jb<END; ++jb) {LAMBDA(jb);};
# define inner_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
Kokkos
::
View
<
double
***
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
#else
# define outer_for(STR, START, END, LAMBDA) Kokkos::parallel_for(STR, Kokkos::RangePolicy<>(START, END), LAMBDA)
# define inner_for(STR, START, END, LAMBDA) for(int jc=START; jc<END; ++jc) {LAMBDA(jc);};
Kokkos
::
View
<
double
***
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
#endif
timer
.
reset
();
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nblocks
),
KOKKOS_LAMBDA
(
const
int
jb
)
{
for
(
int
jc
=
0
;
jc
<
nproma
;
++
jc
)
{
#ifdef DEMO_DEVICE
update_field
(
d_view
,
jc
,
all_levels
,
jb
);
#else
update_field
(
d_view
,
jb
,
all_levels
,
jc
);
#endif
}
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
scenario_7bmacro
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 7bmacro: +ACC CPU:LayoutLeft GPU:Layout
Left;
view(array, nblocks, nlev, nproma);
d_
view(
jb, jk, jc)
----- "
<<
std
::
endl
;
std
::
cout
<<
"scenario 7bmacro: +ACC CPU:LayoutLeft GPU:Layout
Right; CPU
view(array, nblocks, nlev, nproma);
GPU
view(
nproma, nlev, nblocks) parallel: asICON
----- "
<<
std
::
endl
;
#if defined(DEMO_DEVICE)
Kokkos
::
View
<
double
***
,
Kokkos
::
Layout
Righ
t
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
n
blocks
,
nlev
,
n
proma
);
Kokkos
::
View
<
double
***
,
Kokkos
::
Layout
Lef
t
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
n
proma
,
nlev
,
n
blocks
);
#else
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutRight
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutRight
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
#endif
timer
.
reset
();
outer_for
(
""
,
0
,
nblocks
,
outer_lambda
(
const
int
jb
){
inner_for
(
""
,
0
,
nproma
,
inner_lambda
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
d_view
(
jb
,
jk
,
jc
)
=
p
;
}
});
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
show_view
(
"7bmacro"
,
d_view
);
timer
.
reset
();
outer_for
(
""
,
0
,
nblocks
,
outer_lambda
(
const
int
jb
){
inner_for
(
""
,
0
,
nproma
,
inner_lambda
(
const
int
jc
)
{
#ifdef DEMO_DEVICE
update_field
(
d_view
,
jc
,
all_levels
,
jb
);
#else
update_field
(
d_view
,
jb
,
all_levels
,
jc
);
#endif
});
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
scenario_7btemplate
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 7btemplate: +ACC
CPU:LayoutLeft GPU:
Layout
Lef
t; view(array, nblocks, nlev, nproma);
d_view(jb, jk, jc)
----- "
<<
std
::
endl
;
std
::
cout
<<
"scenario 7btemplate: +ACC
allways
Layout
Righ
t; view(array, nblocks, nlev, nproma);
parallel: asICON
----- "
<<
std
::
endl
;
#if defined(DEMO_DEVICE)
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutRight
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
#else
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutRight
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nblocks
,
nlev
,
nproma
);
#endif
timer
.
reset
();
outer_loop
(
nblocks
,
outer_lambda
(
const
int
jb
){
inner_loop
(
nproma
,
inner_lambda
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
d_view
(
jb
,
jk
,
jc
)
=
p
;
}
});
});
timer
.
reset
();
outer_loop
(
nblocks
,
outer_lambda
(
const
int
jb
){
inner_loop
(
nproma
,
inner_lambda
(
const
int
jc
)
{
update_field
(
d_view
,
jb
,
all_levels
,
jc
);
});
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
scenario_8
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 8: +ACC Default Layout as ICON pointer view(array, nproma, nlev, nblocks); d_view(jb, jk, jc) ----- "
<<
std
::
endl
;
using
space_t
=
Kokkos
::
DefaultExecutionSpace
::
memory_space
;
Kokkos
::
View
<
double
***
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
std
::
cout
<<
"scenario 8: +ACC always LayoutLeft view(array, nproma, nlev, nblocks) parallel: nproma ----- "
<<
std
::
endl
;
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
)
;
show_view
(
"scenario_8"
,
d_view
);
timer
.
reset
();
for
(
int
jb
=
0
;
jb
<
nblocks
;
++
jb
)
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nproma
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
#if defined(DEMO_DEVICE)
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
#else
int
p
=
jc
*
nlev
*
nblocks
+
jk
*
nblocks
+
jb
;
#endif
d_view
(
jc
,
jk
,
jb
)
=
p
;
// printf("%f ", d_view(jb, jk, jc));
}
Kokkos
::
parallel_for
(
""
,
Kokkos
::
RangePolicy
<>
(
0
,
nproma
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
update_field
(
d_view
,
jc
,
all_levels
,
jb
);
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
scenario_8macro
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 8macro: +ACC
CPU:LayoutLeft GPU:LayoutLeft as ICON pointer
view(array, nproma, nlev, nblocks);
d_view(jc, jk, jb)
----- "
<<
std
::
endl
;
std
::
cout
<<
"scenario 8macro: +ACC
always LayoutLeftm,
view(array, nproma, nlev, nblocks);
parallel: asICON
----- "
<<
std
::
endl
;
using
space_t
=
Kokkos
::
DefaultExecutionSpace
::
memory_space
;
#if defined(DEMO_DEVICE)
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
#else
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
#endif
timer
.
reset
();
outer_for
(
""
,
0
,
nblocks
,
outer_lambda
(
const
int
jb
)
{
inner_for
(
""
,
0
,
nproma
,
inner_lambda
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
d_view
(
jc
,
jk
,
jb
)
=
p
;
// printf("%f ", d_view(jb, jk, jc));
}
update_field
(
d_view
,
jc
,
all_levels
,
jb
);
});
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
scenario_8template
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
)
{
std
::
cout
<<
"scenario 8template: +ACC
CPU:LayoutLeft GPU:LayoutLeft as ICON pointer
view(array, nproma, nlev, nblocks);
d_view(jc, jk, jb)
----- "
<<
std
::
endl
;
std
::
cout
<<
"scenario 8template: +ACC
always LayoutLeft,
view(array, nproma, nlev, nblocks);
parallel: asICON
----- "
<<
std
::
endl
;
using
space_t
=
Kokkos
::
DefaultExecutionSpace
::
memory_space
;
#if defined(DEMO_DEVICE)
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
#else
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
,
Kokkos
::
MemoryUnmanaged
>
d_view
(
array
,
nproma
,
nlev
,
nblocks
);
#endif
timer
.
reset
();
outer_loop
(
nblocks
,
outer_lambda
(
const
int
jb
)
{
inner_loop
(
nproma
,
inner_lambda
(
const
int
jc
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
int
p
=
jb
*
nlev
*
nproma
+
jk
*
nproma
+
jc
;
d_view
(
jc
,
jk
,
jb
)
=
p
;
// printf("%f ", d_view(jb, jk, jc));
}
update_field
(
d_view
,
jc
,
all_levels
,
jb
);
});
});
Kokkos
::
fence
();
printf
(
"Time = %f ms
\n\n
"
,
timer
.
seconds
()
*
1000
);
}
void
openacc_calls
(
double
*
array
,
int
nblocks
,
int
nlev
,
int
nproma
,
void
openacc_calls
(
double
*
array
,
double
*
ref
,
int
nblocks
,
int
nlev
,
int
nproma
,
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
func
){
#if defined(DEMO_DEVICE)
#if defined(DEMO_DEVICE)
#pragma acc enter data copyin(array[0:nblocks*nlev*nproma])
{
#pragma acc host_data use_device(array)
...
...
@@ -604,64 +616,123 @@ void openacc_calls(double* array, int nblocks, int nlev, int nproma,
}
}
#pragma acc exit data copyout(array[0:nblocks*nlev*nproma])
#else
#else
func
(
array
,
nblocks
,
nlev
,
nproma
);
#endif
validate
(
array
,
nblocks
,
nlev
,
nproma
);
#endif
validate
(
array
,
ref
,
nblocks
,
nlev
,
nproma
);
}
void
init_array
(
double
*
array
,
int
nproma
,
int
nlev
,
int
nblocks
)
{
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
,
Kokkos
::
HostSpace
,
Kokkos
::
MemoryUnmanaged
>
view
(
array
,
nproma
,
nlev
,
nblocks
);
for
(
int
jb
=
0
;
jb
<
nblocks
;
++
jb
)
{
for
(
int
jk
=
0
;
jk
<
nlev
;
++
jk
)
{
for
(
int
jc
=
0
;
jc
<
nproma
;
++
jc
)
{
view
(
jc
,
jk
,
jb
)
=
jc
+
jk
*
nproma
+
jb
*
nproma
*
nblocks
;
}
}
}
}
void
physop
(
double
*
array
,
int
nproma
,
int
nlev
,
int
nblocks
)
{
Kokkos
::
View
<
double
***
,
Kokkos
::
LayoutLeft
,
Kokkos
::
HostSpace
,
Kokkos
::
MemoryUnmanaged
>
view
(
array
,
nproma
,
nlev
,
nblocks
);
for
(
int
jb
=
0
;
jb
<
nblocks
;
++
jb
)
{
for
(
int
jc
=
0
;
jc
<
nproma
;
++
jc
)
{
update_field
(
view
,
jc
,
all_levels
,
jb
);
}
}
#if 0
for (int jb=0; jb < nblocks; ++jb) {
for (int jc=0; jc < nproma; ++jc) {
for (int jk=0; jk < nlev; ++jk) {
printf("physop: jc=%d, jk=%d, jk=%d, view=%f\n",jc,jk,jb,view(jc,jk,jb));
}
}
}
#endif
}
int
main
()
{
int
ncells
=
atoi
(
std
::
getenv
(
"NCELLS"
));
int64_t
ncells64
=
atoi
(
std
::
getenv
(
"NCELLS"
));
assert
(
ncells64
<
INT_MAX
);
int64_t
nproma64
=
atoi
(
std
::
getenv
(
"NPROMA"
));
assert
(
nproma64
<
INT_MAX
);
int
ncells
=
ncells64
;
int
nlev
=
atoi
(
std
::
getenv
(
"NLEV"
));
int
nproma
=
atoi
(
std
::
getenv
(
"NPROMA"
))
;
int
nproma
=
nproma64
;
int
nblocks
=
(
ncells
-
1
)
/
nproma
+
1
;
assert
(
nproma
*
nblocks
==
ncells
);
std
::
cout
<<
"nblocks="
<<
nblocks
<<
", nlev="
<<
nlev
<<
", nproma="
<<
nproma
<<
std
::
endl
;
double
array_out_ref
[
nblocks
*
nlev
*
nproma
];
double
array_in_ref
[
nblocks
*
nlev
*
nproma
];
double
array
[
nblocks
*
nlev
*
nproma
];
const
size_t
nbytes
=
nproma
*
nlev
*
nblocks
*
sizeof
(
double
);
Kokkos
::
initialize
();
{
init_array
(
array_in_ref
,
nproma
,
nlev
,
nblocks
);
memcpy
(
array_out_ref
,
array_in_ref
,
nbytes
);
physop
(
array_out_ref
,
nproma
,
nlev
,
nblocks
);
memcpy
(
array
,
array_in_ref
,
nbytes
);
scenario_1
(
array
,
nblocks
,
nlev
,
nproma
,
false
);
//memcpy(array, array_in_ref, nbytes);
//scenario_1(array, nblocks, nlev, nproma);
memset
(
array
,
0.0
,
sizeof
(
array
));
//validate(array, array_out_ref, nblocks, nlev, nproma);
memcpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_1b
=
scenario_1b
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_1b
);
memset
(
array
,
0.0
,
sizeof
(
array
));
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_1b
);
#if 0
memcpy(array, array_in_ref, nbytes);
std::function<void(double*, int, int, int)> s_1c = scenario_1c;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_1c
);
openacc_calls(array, array_out_ref, nblocks, nlev, nproma, s_1c);
#endif
memset
(
array
,
0.0
,
sizeof
(
array
));
#if 0
std::function<void(double*, int, int, int)> s_1cmacro = scenario_1cmacro;
openacc_calls(array, nblocks, nlev, nproma, s_1cmacro);
memset(array, 0.0, sizeof(array));
#endif
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_7b
=
scenario_7b
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_7b
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_7b
);
mem
set
(
array
,
0.0
,
sizeof
(
array
)
);
mem
cpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_7bmacro
=
scenario_7bmacro
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_7bmacro
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_7bmacro
);
mem
set
(
array
,
0.0
,
sizeof
(
array
)
);
mem
cpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_7btemplate
=
scenario_7btemplate
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_7btemplate
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_7btemplate
);
memset
(
array
,
0.0
,
sizeof
(
array
));
memcpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_8
=
scenario_8
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_8
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_8
);
mem
set
(
array
,
0.0
,
sizeof
(
array
)
);
mem
cpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_8macro
=
scenario_8macro
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_8macro
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_8macro
);
mem
set
(
array
,
0.0
,
sizeof
(
array
)
);
mem
cpy
(
array
,
array_in_ref
,
nbytes
);
std
::
function
<
void
(
double
*
,
int
,
int
,
int
)
>
s_8template
=
scenario_8template
;
openacc_calls
(
array
,
nblocks
,
nlev
,
nproma
,
s_8template
);
openacc_calls
(
array
,
array_out_ref
,
nblocks
,
nlev
,
nproma
,
s_8template
);
// exit(0);
/*
scenario_2(array, nblocks, nlev, nproma);
scenario_2b(array, nblocks, nlev, nproma);
...
...
@@ -670,7 +741,9 @@ int main() {
scenario_4b(array, nblocks, nlev, nproma);
scenario_5(array, nblocks, nlev, nproma);
scenario_6(array, nblocks, nlev, nproma);
scenario_7(array, nblocks, nlev, nproma);
validate(array, array_out_ref, nblocks, nlev, nproma);
*/
}
...
...
This diff is collapsed.
Click to expand it.
script.sh
View file @
047f3330
...
...
@@ -14,15 +14,15 @@ then
cmake
-B
build_gpu
-S
.
-DMU_ARCH
=
a100
-DCMAKE_CXX_FLAGS
=
"-O3"
cmake
--build
build_gpu
--parallel
ncells
=(
50000
00
)
ncells
=(
50000
64
)
nlev
=(
90
)
nproma
=(
50000
00
)
nproma
=(
50000
64
)
else
#rm -rf build
cmake
-B
build
-S
.
-DMU_ARCH
=
x86_64
-DCMAKE_CXX_FLAGS
=
"-O3"
cmake
--build
build
--parallel
ncells
=(
50000
00
)
ncells
=(
50000
64
)
nlev
=(
90
)
nproma
=(
32 64 96 128
)
...
...
@@ -42,6 +42,7 @@ for jb in ${ncells[*]}; do
./build_gpu/demo
else
./build/demo
echo
"---"
fi
done
done
...
...
This diff is collapsed.
Click to expand it.
set_env.sh
0 → 100644
View file @
047f3330
#source this
module load gcc nvhpc
LD_LIBRARY_PATH
=
/sw/spack-levante/gcc-11.2.0-bcn7mb/lib/gcc/x86_64-pc-linux-gnu/11.2.0/../../../../lib64/:
$LD_LIBRARY_PATH
This diff is collapsed.
Click to expand it.