Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libiconmath
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
icon-libraries
libiconmath
Commits
64d639c4
Commit
64d639c4
authored
2 months ago
by
Pradipta Samanta
Browse files
Options
Downloads
Patches
Plain Diff
converted the c++ code in mo_math_utilities to Kokkos
parent
d2ee94aa
No related branches found
Branches containing commit
No related tags found
1 merge request
!37
Draft: C++ port of horizontal/mo_lib_gradients.F90
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/support/mo_math_utilities.cpp
+61
-67
61 additions, 67 deletions
src/support/mo_math_utilities.cpp
with
61 additions
and
67 deletions
src/support/mo_math_utilities.cpp
+
61
−
67
View file @
64d639c4
...
...
@@ -12,75 +12,69 @@
#include
<vector>
#include
<iostream>
#include
<chrono>
// For timing
#include
<Kokkos_Core.hpp>
extern
"C"
{
void
tdma_solver_vec
(
double
*
a
,
double
*
b
,
double
*
c
,
double
*
d
,
int
slev
,
int
elev
,
int
startidx
,
int
endidx
,
int
nrows
,
int
ncols
,
double
*
varout
,
int
opt_acc_queue
=
-
1
)
{
// Start timing
auto
start_time
=
std
::
chrono
::
high_resolution_clock
::
now
();
int
acc_queue
=
(
opt_acc_queue
==
-
1
)
?
1
:
opt_acc_queue
;
// Use 1 as the default if opt_acc_queue is not provided
double
*
cp
=
new
double
[
nrows
*
ncols
];
double
*
dp
=
new
double
[
nrows
*
ncols
];
#define IDX(row, col) ((col) * nrows + (row)) // performs better than lambda function
// OpenACC Parallel Region
#pragma acc parallel default(present) create(cp[:nrows*ncols], dp[:nrows*ncols]) async(acc_queue)
{
// Initialize c-prime and d-prime
#pragma acc loop gang(static: 1) vector
for
(
int
jc
=
startidx
;
jc
<
endidx
;
++
jc
)
{
cp
[
IDX
(
jc
,
slev
)]
=
c
[
IDX
(
jc
,
slev
)]
/
b
[
IDX
(
jc
,
slev
)];
dp
[
IDX
(
jc
,
slev
)]
=
d
[
IDX
(
jc
,
slev
)]
/
b
[
IDX
(
jc
,
slev
)];
}
// Solve for vectors c-prime and d-prime
#pragma acc loop seq
for
(
int
i
=
slev
+
1
;
i
<
elev
;
++
i
)
{
#pragma acc loop gang(static: 1) vector
for
(
int
jc
=
startidx
;
jc
<
endidx
;
++
jc
)
{
double
m
=
1.0
/
(
b
[
IDX
(
jc
,
i
)]
-
cp
[
IDX
(
jc
,
i
-
1
)]
*
a
[
IDX
(
jc
,
i
)]);
cp
[
IDX
(
jc
,
i
)]
=
c
[
IDX
(
jc
,
i
)]
*
m
;
dp
[
IDX
(
jc
,
i
)]
=
(
d
[
IDX
(
jc
,
i
)]
-
dp
[
IDX
(
jc
,
i
-
1
)]
*
a
[
IDX
(
jc
,
i
)])
*
m
;
}
}
// Initialize varout
#pragma acc loop gang(static: 1) vector
for
(
int
jc
=
startidx
;
jc
<
endidx
;
++
jc
)
{
varout
[
IDX
(
jc
,
elev
-
1
)]
=
dp
[
IDX
(
jc
,
elev
-
1
)];
}
// Solve for varout from the vectors c-prime and d-prime
#pragma acc loop seq
for
(
int
i
=
elev
-
2
;
i
>=
slev
;
--
i
)
{
#pragma acc loop gang(static: 1) vector
for
(
int
jc
=
startidx
;
jc
<
endidx
;
++
jc
)
{
varout
[
IDX
(
jc
,
i
)]
=
dp
[
IDX
(
jc
,
i
)]
-
cp
[
IDX
(
jc
,
i
)]
*
varout
[
IDX
(
jc
,
i
+
1
)];
}
}
}
printf
(
"tdma_solver_vec: completed using C++
\n
"
);
// Wait for OpenACC asynchronous operations to complete if acc_queue is not optional
if
(
opt_acc_queue
==
-
1
)
{
#pragma acc wait(acc_queue)
}
// Free memory at the end
delete
[]
cp
;
delete
[]
dp
;
// End timing
auto
end_time
=
std
::
chrono
::
high_resolution_clock
::
now
();
std
::
chrono
::
duration
<
double
>
elapsed_time
=
end_time
-
start_time
;
std
::
cout
<<
"Elapsed time for tdma_solver_vec (C++): "
<<
elapsed_time
.
count
()
<<
" seconds"
<<
std
::
endl
;
void
tdma_solver_vec_kokkos
(
const
double
*
a
,
const
double
*
b
,
const
double
*
c
,
const
double
*
d
,
int
slev
,
int
elev
,
int
startidx
,
int
endidx
,
int
nrows
,
int
ncols
,
double
*
varout
)
{
// Start timing
auto
start_time
=
std
::
chrono
::
high_resolution_clock
::
now
();
// Allocate temporary arrays using Kokkos::View.
// The views c_p and d_p are allocated as 2D arrays with dimensions [nrows][ncols].
// Kokkos::View automatically handles memory management.
Kokkos
::
View
<
double
**>
c_p
(
"c_p"
,
nrows
,
ncols
);
Kokkos
::
View
<
double
**>
d_p
(
"d_p"
,
nrows
,
ncols
);
// Wrap the input arrays in unmanaged views.
// We assume that the input arrays are laid out in column-major order as in the original code.
// Here we use LayoutLeft so that the first index (row) is contiguous.
typedef
Kokkos
::
View
<
const
double
**
,
Kokkos
::
LayoutLeft
,
Kokkos
::
MemoryUnmanaged
>
UnmanagedConst2D
;
typedef
Kokkos
::
View
<
double
**
,
Kokkos
::
LayoutLeft
,
Kokkos
::
MemoryUnmanaged
>
Unmanaged2D
;
UnmanagedConst2D
a_view
(
a
,
nrows
,
ncols
);
UnmanagedConst2D
b_view
(
b
,
nrows
,
ncols
);
UnmanagedConst2D
c_view
(
c
,
nrows
,
ncols
);
UnmanagedConst2D
d_view
(
d
,
nrows
,
ncols
);
Unmanaged2D
varout_view
(
varout
,
nrows
,
ncols
);
// Initialize c-prime and d-prime at the starting level (slev)
Kokkos
::
parallel_for
(
"init_c_p_d_p"
,
Kokkos
::
RangePolicy
<>
(
startidx
,
endidx
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
c_p
(
jc
,
slev
)
=
c_view
(
jc
,
slev
)
/
b_view
(
jc
,
slev
);
d_p
(
jc
,
slev
)
=
d_view
(
jc
,
slev
)
/
b_view
(
jc
,
slev
);
});
Kokkos
::
fence
();
// Forward sweep: compute c-prime and d-prime for each column from slev+1 to elev-1.
for
(
int
i
=
slev
+
1
;
i
<
elev
;
++
i
)
{
Kokkos
::
parallel_for
(
"forward_sweep"
,
Kokkos
::
RangePolicy
<>
(
startidx
,
endidx
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
double
m
=
1.0
/
(
b_view
(
jc
,
i
)
-
c_p
(
jc
,
i
-
1
)
*
a_view
(
jc
,
i
));
c_p
(
jc
,
i
)
=
c_view
(
jc
,
i
)
*
m
;
d_p
(
jc
,
i
)
=
(
d_view
(
jc
,
i
)
-
d_p
(
jc
,
i
-
1
)
*
a_view
(
jc
,
i
))
*
m
;
});
Kokkos
::
fence
();
}
// Initialize the output array at the last level (elev-1)
Kokkos
::
parallel_for
(
"init_varout"
,
Kokkos
::
RangePolicy
<>
(
startidx
,
endidx
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
varout_view
(
jc
,
elev
-
1
)
=
d_p
(
jc
,
elev
-
1
);
});
Kokkos
::
fence
();
// Back substitution: update varout for columns from elev-2 down to slev.
for
(
int
i
=
elev
-
2
;
i
>=
slev
;
--
i
)
{
Kokkos
::
parallel_for
(
"back_substitution"
,
Kokkos
::
RangePolicy
<>
(
startidx
,
endidx
),
KOKKOS_LAMBDA
(
const
int
jc
)
{
varout_view
(
jc
,
i
)
=
d_p
(
jc
,
i
)
-
c_p
(
jc
,
i
)
*
varout_view
(
jc
,
i
+
1
);
});
Kokkos
::
fence
();
}
// End timing and print the elapsed time
auto
end_time
=
std
::
chrono
::
high_resolution_clock
::
now
();
std
::
chrono
::
duration
<
double
>
elapsed_time
=
end_time
-
start_time
;
std
::
cout
<<
"Elapsed time for tdma_solver_vec (Kokkos): "
<<
elapsed_time
.
count
()
<<
" seconds"
<<
std
::
endl
;
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment