Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
cdo
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Model registry
Analyze
Contributor analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
mpim-sw
cdo
Commits
da7835cb
Commit
da7835cb
authored
2 years ago
by
Oliver Heidmann
Browse files
Options
Downloads
Patches
Plain Diff
progress on cufft implementation
parent
72d3f608
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
src/Filter.cc
+57
-66
57 additions, 66 deletions
src/Filter.cc
src/cdo_options.cc
+4
-1
4 additions, 1 deletion
src/cdo_options.cc
src/cdo_options.h
+1
-0
1 addition, 0 deletions
src/cdo_options.h
src/cufft.cu
+96
-41
96 additions, 41 deletions
src/cufft.cu
with
158 additions
and
108 deletions
src/Filter.cc
+
57
−
66
View file @
da7835cb
...
...
@@ -35,14 +35,12 @@
// Forward declaration of wrapper functions that will call CUFFT
// extern void sLaunchCUFFT(float *d_data, int n, void *stream);
extern
void
sPlan1dCUFFT
(
int
fftsize
,
void
*
stream
);
extern
void
sExecCUFFT
(
float
*
sarray
);
extern
void
sDestroyCUFFT
();
// extern void dLaunchCUFFT(double *d_data, int n, void *stream);
extern
void
dPlan1dCUFFT
(
int
fftsize
,
void
*
stream
);
extern
void
dExecCUFFT
(
double
*
darray
);
extern
void
dDestroyCUFFT
();
int
cdo_cufft_init
(
size_t
num_time_steps
,
size_t
batch_size
);
int
cdo_cufft_gpu_routine
(
int
*
fmasc
,
size_t
nts
,
size_t
batch_size
);
int
cdo_cufft_free
();
int
cdo_set_gpu_mem
(
float
value
,
size_t
nts
,
size_t
t
,
size_t
batchID
);
float
get_cufft_filter_result
(
size_t
batchID
,
size_t
nts
,
size_t
t
);
bool
CUFFT_FILTER
=
false
;
static
void
create_fmasc
(
int
nts
,
double
fdata
,
double
fmin
,
double
fmax
,
std
::
vector
<
int
>
&
fmasc
)
{
...
...
@@ -161,86 +159,75 @@ cdo_ff(std::vector<FourierMemory> &fourierMemory, int nts, VarList &varList, int
}
}
int
int
cdo_cufft
(
int
nts
,
VarList
&
varList
,
int
nvars
,
FieldVector3D
&
vars
,
std
::
vector
<
int
>
&
fmasc
)
{
#define HAVE_CUFFT
#ifdef HAVE_CUFFT
CdoOptions
::
cufft_filter_batchsize
=
1024
;
// tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
size_t
batch_size
=
CdoOptions
::
cufft_filter_batchsize
;
cufftReal
*
cufft_mem_in
;
cufftComplex
*
cufft_mem_out
;
cufftHandle
cufftHandle_T2S
;
cufftHandle
cufftHandle_S2T
;
//?? time to spectral and spectral to time ????
cufftPlan1d
(
num_time_steps
,
cufft_mem_in
,
cufft_mem_out
,
1
,
FFTW_ESTIMATE
);
cufftPlan1d
(
num_time_steps
,
cufft_mem_out
,
cufft_mem_in
,
-
1
,
FFTW_ESTIMATE
);
// Test if worked => cuda return code
size_t
batch_size
=
1
;
// Options::cufft_filter_batchsize;
cudaMalloc
((
void
**
)
&
cufft_mem_in
,
sizeof
(
cufftReal
)
*
num_time_steps
*
batch_size
);
cudaMalloc
((
void
**
)
&
cufft_mem_out
,
sizeof
(
cufftComplex
)
*
num_time_steps
*
batch_size
);
int
success
=
cdo_cufft_init
(
nts
,
batch_size
);
if
(
success
!=
0
){
return
success
;
}
else
if
(
true
){
Debug
(
CUFFT_FILTER
,
"cdo CUFFT INIT SUCCESS"
);
Debug
(
CUFFT_FILTER
,
"Size: %d"
,
nts
*
batch_size
);
//Debug("cdo CUFFT INIT SUCCESS");
}
for
(
int
varID
=
0
;
varID
<
nvars
;
++
varID
)
for
(
int
varID
=
0
;
varID
<
nvars
;
++
varID
)
{
auto
fieldMemType
=
varList
[
varID
].
memType
;
const
auto
gridsize
=
varList
[
varID
].
gridsize
;
for
(
int
levelID
=
0
;
levelID
<
varList
[
varID
].
nlevels
;
++
levelID
)
auto
fieldMemType
=
varList
[
varID
].
memType
;
const
auto
gridsize
=
varList
[
varID
].
gridsize
;
for
(
int
levelID
=
0
;
levelID
<
varList
[
varID
].
nlevels
;
++
levelID
)
{
for
(
size_t
i
=
0
;
i
<
gridsize
;
i
+=
batch_size
)
for
(
size_t
i
=
0
;
i
<
gridsize
;
i
+=
batch_size
)
{
// prepare date for gpu
//-----------------------------------
for
(
size_t
batchID
=
0
;
batchID
<
batch_size
;
batchID
++
)
// prepare date for gpu
//-----------------------------------
for
(
size_t
batchID
=
0
;
batchID
<
batch_size
;
batchID
++
)
{
for
(
int
t
=
0
;
t
<
nts
;
++
t
)
for
(
int
t
=
0
;
t
<
nts
&&
i
+
batchID
<
gridsize
;
++
t
)
{
cufft_mem_in
[(
batchID
*
nts
)
+
t
]
=
vars
[
t
][
varID
][
levelID
].
vec_f
[
i
+
batchID
];
//Debug(CUFFT_FILTER,"accessing %d", (nts * batchID) + t);
cdo_set_gpu_mem
(
vars
[
t
][
varID
][
levelID
].
vec_f
[(
i
*
batch_size
)
+
batchID
],
nts
,
t
,
batchID
);
}
}
// gpu calculations
//-----------------------------------
cufftExecR2C
(
cufftHandle_T2S
,
cufft_mem_in
,
cufft_mem_out
);
for
(
size_t
batchID
=
0
;
batchID
<
batch_size
;
batchID
++
)
{
for
(
int
i
=
0
;
i
<
nts
;
++
i
)
if
(
!
fmasc
[
i
])
{
cufft_mem_out
[(
batchID
*
nts
)
+
i
][
0
]
=
0.0
;
cufft_mem_out
[(
batchID
*
nts
)
+
i
][
1
]
=
0.0
;
}
// gpu calculations
//-----------------------------------
auto
success
=
cdo_cufft_gpu_routine
(
fmasc
.
data
(),
nts
,
batch_size
);
if
(
success
!=
0
){
cdo_print
(
"ERROR in cufft gpu routine, err code = %d"
,
success
);
}
cufftExecC2R
(
cufftHandle_T2S
,
cufft_mem_out
,
cufft_mem_in
);
// retransform
//-----------------------------------
for
(
size_t
batchID
=
0
;
batchID
<
batch_size
;
batchID
++
)
// retransform
//
//-----------------------------------
for
(
size_t
outBatchID
=
0
;
outBatchID
<
batch_size
;
outBatchID
++
)
{
for
(
int
t
=
0
;
t
<
nts
;
++
t
)
for
(
int
t
=
0
;
t
<
nts
&&
i
+
outBatchID
<
gridsize
;
++
t
)
{
// divide by nts for normalization, check if cufft does that
vars
[
t
][
varID
][
levelID
].
vec_d
[
i
]
=
cufft_mem_in
[(
batchID
*
nts
)
+
t
]
/
nts
;
Debug
(
CUFFT_FILTER
,
"%d, %d, %d"
,
nts
,
outBatchID
,
t
);
Debug
(
CUFFT_FILTER
,
"accessing %d"
,
(
nts
*
outBatchID
)
+
t
);
Debug
(
CUFFT_FILTER
,
"accessing %d"
,
(
i
*
batch_size
)
+
outBatchID
);
// divide by nts for normalization, check if cufft does that
float
res
=
get_cufft_filter_result
(
outBatchID
,
nts
,
t
);
Debug
(
CUFFT_FILTER
,
"got res"
);
vars
[
t
][
varID
][
levelID
].
vec_f
[(
i
*
batch_size
)
+
outBatchID
]
=
res
/
nts
;
}
}
//-----------------------------------
//-----------------------------------
}
}
}
// todo replace with cufft
cudaFree
(
cufft_mem_in
);
cudaFree
(
cufft_mem_out
);
cufftDestroy
(
cufftHandle_T2S
);
cufftDestroy
(
cufftHandle_S2T
);
return
true
;
cdo_cufft_free
();
return
0
;
#else
cdo_warning
(
"LIBFFTW3 support not compiled in!"
);
cdo_warning
(
"Using intrinsic FFT function!"
);
return
false
;
cdo_warning
(
"LIBFFTW3 support not compiled in!"
);
cdo_warning
(
"Using intrinsic FFT function!"
);
return
-
1
;
#endif
}
...
...
@@ -417,7 +404,6 @@ Filter(void *process)
const
auto
nts
=
tsID
;
if
(
nts
<=
1
)
cdo_abort
(
"Number of time steps <= 1!"
);
std
::
vector
<
FourierMemory
>
fourierMemory
(
Threading
::
ompNumThreads
);
double
fmin
=
0.0
,
fmax
=
0.0
;
switch
(
operfunc
)
...
...
@@ -455,14 +441,19 @@ Filter(void *process)
if
(
Options
::
use_gpu
)
{
auto
success
=
cdo_cufft
(
nts
,
varList
,
nvars
,
vars
,
fmasc
);
if
(
success
!=
0
){
cdo_print
(
"Error in cdo_cufft"
);
return
process
;
}
}
else
if
(
Options
::
Use_FFTW
)
{
std
::
vector
<
FourierMemory
>
fourierMemory
(
Threading
::
ompNumThreads
);
cdo_fftw
(
fourierMemory
,
nts
,
varList
,
nvars
,
vars
,
fmasc
);
}
else
{
std
::
vector
<
FourierMemory
>
fourierMemory
(
Threading
::
ompNumThreads
);
cdo_ff
(
fourierMemory
,
nts
,
varList
,
nvars
,
vars
,
fmasc
);
}
...
...
This diff is collapsed.
Click to expand it.
src/cdo_options.cc
+
4
−
1
View file @
da7835cb
...
...
@@ -28,7 +28,10 @@ bool stderrIsTerminal = false;
namespace
Options
{
bool
use_gpu
=
false
;
// tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
size_t
cufft_filter_batchsize
=
1024
;
int
numStreamWorker
;
long
coresize
=
0
;
int
nsb
=
0
;
// Number of significant bits
...
...
This diff is collapsed.
Click to expand it.
src/cdo_options.h
+
1
−
0
View file @
da7835cb
...
...
@@ -32,6 +32,7 @@ namespace Options
{
//temporary option for gpus (10.3.2021)
extern
bool
use_gpu
;
extern
size_t
cufft_filter_batchsize
;
extern
long
coresize
;
extern
int
numStreamWorker
;
extern
int
nsb
;
// Number of significant bits
...
...
This diff is collapsed.
Click to expand it.
src/cufft.cu
+
96
−
41
View file @
da7835cb
#include
<cufft.h>
#include
<cuda.h>
#include
<cstdio>
cufftHandle
splan
;
cufftHandle
dplan
;
cufftComplex
*
cufft_mem_in
;
cufftComplex
*
cufft_mem_out
;
cufftHandle
cufftHandle_T2S
;
cufftHandle
cufftHandle_S2T
;
int
cdo_cufft_initialized
=
false
;
void
sPlanCUFFT
(
int
n
,
void
*
stream
)
{
cufft
Plan1d
(
&
splan
,
n
,
CUFFT_C2C
,
1
)
;
cufftSetStream
(
splan
,
(
cudaStream_t
)
stream
)
;
int
cdo_set_gpu_mem
(
cufftReal
value
,
size_t
nts
,
size_t
t
,
size_t
batchID
){
cufft_mem_in
[(
batchID
*
nts
)
+
t
].
x
=
value
;
cufft
_mem_in
[(
batchID
*
nts
)
+
t
].
y
=
0.0
;
return
1
;
}
void
sExecCUFFT
(
float
*
sdata
)
{
cufftExecC2C
(
splan
,
(
cufftComplex
*
)
sdata
,
(
cufftComplex
*
)
sdata
,
CUFFT_FORWARD
);
float
get_cufft_filter_result
(
size_t
batchID
,
size_t
nts
,
size_t
t
){
return
cufft_mem_in
[(
batchID
*
nts
)
+
t
].
x
;
}
void
sDestroyCUFFT
()
{
cufftDestroy
(
splan
);
}
int
cdo_cufft_init
(
size_t
num_time_steps
,
size_t
batch_size
){
void
dPlanCUFFT
(
int
n
,
void
*
stream
)
{
cufftPlan1d
(
&
dplan
,
n
,
CUFFT_Z2Z
,
1
);
cufftSetStream
(
dplan
,
(
cudaStream_t
)
stream
);
}
int
success
=
0
;
if
(
cdo_cufft_initialized
==
true
){
success
=
-
77
;
return
success
;
}
//?? time to spectral and spectral to time ????
success
=
cufftPlan1d
(
&
cufftHandle_T2S
,
num_time_steps
,
CUFFT_C2C
,
batch_size
);
if
(
success
!=
CUFFT_SUCCESS
){
printf
(
"error in creating T2S plan in cufft.cu"
);
return
success
;
}
success
=
cufftPlan1d
(
&
cufftHandle_S2T
,
num_time_steps
,
CUFFT_C2C
,
batch_size
);
if
(
success
!=
CUFFT_SUCCESS
){
printf
(
"error in creating S2T plan in cufft.cu"
);
return
success
;
}
// Test if worked => cuda return code
void
dExecCUFFT
(
double
*
ddata
)
{
cufftExecZ2Z
(
dplan
,
(
cufftDoubleComplex
*
)
ddata
,
(
cufftDoubleComplex
*
)
ddata
,
CUFFT_FORWARD
);
}
success
=
cudaMallocManaged
((
void
**
)
&
cufft_mem_in
,
sizeof
(
cufftComplex
)
*
num_time_steps
*
batch_size
);
if
(
success
!=
CUFFT_SUCCESS
){
printf
(
"error allocating input memory in cufft.cu"
);
return
success
;
}
success
=
cudaMallocManaged
((
void
**
)
&
cufft_mem_out
,
sizeof
(
cufftComplex
)
*
num_time_steps
*
batch_size
);
if
(
success
!=
CUFFT_SUCCESS
){
printf
(
"error allocating output memory in cufft.cu"
);
return
success
;
}
void
dDestroyCUFFT
()
{
cufftDestroy
(
dplan
);
if
(
success
==
0
){
cdo_cufft_initialized
=
true
;
}
return
success
;
}
/*
void sLaunchCUFFT(float *sdata, int n, void *stream)
{
cufftHandle plan;
cufftPlan1d(&plan, n, CUFFT_C2C, 1);
cufftSetStream(plan, (cudaStream_t)stream);
cufftExecC2C(plan, (cufftComplex*)sdata, (cufftComplex*)sdata, CUFFT_FORWARD);
cufftDestroy(plan);
int
cdo_cufft_gpu_routine
(
int
*
fmasc
,
size_t
nts
,
size_t
batch_size
){
int
success
=
0
;
success
=
cufftExecC2C
(
cufftHandle_T2S
,
cufft_mem_in
,
cufft_mem_out
,
CUFFT_FORWARD
);
if
(
success
!=
CUFFT_SUCCESS
){
return
success
;
}
for
(
size_t
batchID
=
0
;
batchID
<
batch_size
;
batchID
++
)
{
for
(
size_t
i
=
0
;
i
<
nts
;
++
i
)
if
(
!
fmasc
[
i
])
{
cufft_mem_in
[(
batchID
*
nts
)
+
i
].
x
=
0.0
;
cufft_mem_in
[(
batchID
*
nts
)
+
i
].
y
=
0.0
;
}
}
success
=
cufftExecC2C
(
cufftHandle_S2T
,
cufft_mem_in
,
cufft_mem_out
,
CUFFT_INVERSE
);
if
(
success
!=
CUFFT_SUCCESS
){
return
success
;
}
//TODO error codes
return
CUFFT_SUCCESS
;
}
void dLaunchCUFFT(double *ddata, int n, void *stream)
{
cufftHandle plan;
cufftPlan1d(&plan, n, CUFFT_Z2Z, 1);
cufftSetStream(plan, (cudaStream_t)stream);
cufftExecZ2Z(plan, (cufftDoubleComplex*)ddata, (cufftDoubleComplex*)ddata, CUFFT_FORWARD);
cufftDestroy(plan);
int
cdo_cufft_free
(){
int
success
=
0
;
success
=
cudaFree
(
cufft_mem_in
);
if
(
success
!=
CUDA_SUCCESS
){
printf
(
"Error: could not frree cufft cuda input memory"
);
return
success
;
}
success
=
cudaFree
(
cufft_mem_out
);
if
(
success
!=
CUDA_SUCCESS
){
printf
(
"Error: could not frree cufft cuda output memory"
);
return
success
;
}
success
=
cufftDestroy
(
cufftHandle_T2S
);
if
(
success
!=
CUDA_SUCCESS
){
printf
(
"Error: could not frree cufft cuda plan T2S"
);
return
success
;}
success
=
cufftDestroy
(
cufftHandle_S2T
);
if
(
success
!=
CUDA_SUCCESS
){
printf
(
"Error: could not frree cufft cuda plan S2T"
);
return
success
;}
cdo_cufft_initialized
=
false
;
return
success
;
}
*/
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment