Skip to content
Snippets Groups Projects
Commit da7835cb authored by Oliver Heidmann's avatar Oliver Heidmann
Browse files

progress on cufft implementation

parent 72d3f608
No related branches found
No related tags found
No related merge requests found
......@@ -35,14 +35,12 @@
// Forward declaration of wrapper functions that will call CUFFT
// extern void sLaunchCUFFT(float *d_data, int n, void *stream);
extern void sPlan1dCUFFT(int fftsize, void *stream);
extern void sExecCUFFT(float *sarray);
extern void sDestroyCUFFT();
// extern void dLaunchCUFFT(double *d_data, int n, void *stream);
extern void dPlan1dCUFFT(int fftsize, void *stream);
extern void dExecCUFFT(double *darray);
extern void dDestroyCUFFT();
int cdo_cufft_init(size_t num_time_steps,size_t batch_size);
int cdo_cufft_gpu_routine(int* fmasc,size_t nts, size_t batch_size);
int cdo_cufft_free();
int cdo_set_gpu_mem(float value, size_t nts, size_t t, size_t batchID);
float get_cufft_filter_result(size_t batchID, size_t nts, size_t t);
bool CUFFT_FILTER = false;
static void
create_fmasc(int nts, double fdata, double fmin, double fmax, std::vector<int> &fmasc)
{
......@@ -161,86 +159,75 @@ cdo_ff(std::vector<FourierMemory> &fourierMemory, int nts, VarList &varList, int
}
}
int
int
cdo_cufft(int nts, VarList &varList, int nvars, FieldVector3D &vars, std::vector<int> &fmasc)
{
#define HAVE_CUFFT
#ifdef HAVE_CUFFT
CdoOptions::cufft_filter_batchsize
= 1024; // tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
size_t batch_size = CdoOptions::cufft_filter_batchsize;
cufftReal *cufft_mem_in;
cufftComplex *cufft_mem_out;
cufftHandle cufftHandle_T2S;
cufftHandle cufftHandle_S2T;
//?? time to spectral and spectral to time ????
cufftPlan1d(num_time_steps, cufft_mem_in, cufft_mem_out, 1, FFTW_ESTIMATE);
cufftPlan1d(num_time_steps, cufft_mem_out, cufft_mem_in, -1, FFTW_ESTIMATE);
// Test if worked => cuda return code
size_t batch_size = 1;// Options::cufft_filter_batchsize;
cudaMalloc((void **) &cufft_mem_in, sizeof(cufftReal) * num_time_steps * batch_size);
cudaMalloc((void **) &cufft_mem_out, sizeof(cufftComplex) * num_time_steps * batch_size);
int success = cdo_cufft_init(nts,batch_size);
if(success != 0){
return success;
}
else if(true){
Debug(CUFFT_FILTER,"cdo CUFFT INIT SUCCESS");
Debug(CUFFT_FILTER,"Size: %d", nts * batch_size);
//Debug("cdo CUFFT INIT SUCCESS");
}
for (int varID = 0; varID < nvars; ++varID)
for (int varID = 0; varID < nvars; ++varID)
{
auto fieldMemType = varList[varID].memType;
const auto gridsize = varList[varID].gridsize;
for (int levelID = 0; levelID < varList[varID].nlevels; ++levelID)
auto fieldMemType = varList[varID].memType;
const auto gridsize = varList[varID].gridsize;
for (int levelID = 0; levelID < varList[varID].nlevels; ++levelID)
{
for (size_t i = 0; i < gridsize; i += batch_size)
for (size_t i = 0; i < gridsize; i += batch_size)
{
// prepare date for gpu
//-----------------------------------
for (size_t batchID = 0; batchID < batch_size; batchID++)
// prepare date for gpu
//-----------------------------------
for (size_t batchID = 0; batchID < batch_size; batchID++)
{
for (int t = 0; t < nts; ++t)
for (int t = 0; t < nts && i+batchID < gridsize; ++t)
{
cufft_mem_in[(batchID * nts) + t] = vars[t][varID][levelID].vec_f[i + batchID];
//Debug(CUFFT_FILTER,"accessing %d", (nts * batchID) + t);
cdo_set_gpu_mem(vars[t][varID][levelID].vec_f[(i*batch_size)+batchID],nts,t,batchID);
}
}
// gpu calculations
//-----------------------------------
cufftExecR2C(cufftHandle_T2S, cufft_mem_in, cufft_mem_out);
for (size_t batchID = 0; batchID < batch_size; batchID++)
{
for (int i = 0; i < nts; ++i)
if (!fmasc[i])
{
cufft_mem_out[(batchID * nts) + i][0] = 0.0;
cufft_mem_out[(batchID * nts) + i][1] = 0.0;
}
// gpu calculations
//-----------------------------------
auto success = cdo_cufft_gpu_routine(fmasc.data(),nts,batch_size);
if(success != 0){
cdo_print("ERROR in cufft gpu routine, err code = %d", success);
}
cufftExecC2R(cufftHandle_T2S, cufft_mem_out, cufft_mem_in);
// retransform
//-----------------------------------
for (size_t batchID = 0; batchID < batch_size; batchID++)
// retransform
//
//-----------------------------------
for (size_t outBatchID = 0; outBatchID < batch_size; outBatchID++)
{
for (int t = 0; t < nts; ++t)
for (int t = 0; t < nts && i+outBatchID < gridsize; ++t)
{
// divide by nts for normalization, check if cufft does that
vars[t][varID][levelID].vec_d[i] = cufft_mem_in[(batchID * nts) + t] / nts;
Debug(CUFFT_FILTER,"%d, %d, %d", nts, outBatchID, t);
Debug(CUFFT_FILTER,"accessing %d", (nts * outBatchID) + t);
Debug(CUFFT_FILTER,"accessing %d", (i * batch_size)+outBatchID);
// divide by nts for normalization, check if cufft does that
float res = get_cufft_filter_result(outBatchID,nts,t);
Debug(CUFFT_FILTER,"got res");
vars[t][varID][levelID].vec_f[(i*batch_size) +outBatchID] = res/nts;
}
}
//-----------------------------------
//-----------------------------------
}
}
}
// todo replace with cufft
cudaFree(cufft_mem_in);
cudaFree(cufft_mem_out);
cufftDestroy(cufftHandle_T2S);
cufftDestroy(cufftHandle_S2T);
return true;
cdo_cufft_free();
return 0;
#else
cdo_warning("LIBFFTW3 support not compiled in!");
cdo_warning("Using intrinsic FFT function!");
return false;
cdo_warning("LIBFFTW3 support not compiled in!");
cdo_warning("Using intrinsic FFT function!");
return -1;
#endif
}
......@@ -417,7 +404,6 @@ Filter(void *process)
const auto nts = tsID;
if (nts <= 1) cdo_abort("Number of time steps <= 1!");
std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);
double fmin = 0.0, fmax = 0.0;
switch (operfunc)
......@@ -455,14 +441,19 @@ Filter(void *process)
if (Options::use_gpu)
{
auto success = cdo_cufft(nts, varList, nvars, vars, fmasc);
if(success != 0){
cdo_print("Error in cdo_cufft");
return process;
}
}
else if (Options::Use_FFTW)
{
std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);
cdo_fftw(fourierMemory, nts, varList, nvars, vars, fmasc);
}
else
{
std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);
cdo_ff(fourierMemory, nts, varList, nvars, vars, fmasc);
}
......
......@@ -28,7 +28,10 @@ bool stderrIsTerminal = false;
namespace Options
{
bool use_gpu = false;
// tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
size_t cufft_filter_batchsize = 1024;
int numStreamWorker;
long coresize = 0;
int nsb = 0; // Number of significant bits
......
......@@ -32,6 +32,7 @@ namespace Options
{
//temporary option for gpus (10.3.2021)
extern bool use_gpu;
extern size_t cufft_filter_batchsize;
extern long coresize;
extern int numStreamWorker;
extern int nsb; // Number of significant bits
......
#include <cufft.h>
#include <cuda.h>
#include <cstdio>
cufftHandle splan;
cufftHandle dplan;
cufftComplex *cufft_mem_in;
cufftComplex *cufft_mem_out;
cufftHandle cufftHandle_T2S;
cufftHandle cufftHandle_S2T;
int cdo_cufft_initialized = false;
void sPlanCUFFT(int n, void *stream)
{
cufftPlan1d(&splan, n, CUFFT_C2C, 1);
cufftSetStream(splan, (cudaStream_t)stream);
int cdo_set_gpu_mem(cufftReal value, size_t nts, size_t t, size_t batchID){
cufft_mem_in[(batchID * nts) + t].x = value;
cufft_mem_in[(batchID * nts) + t].y = 0.0;
return 1;
}
void sExecCUFFT(float *sdata)
{
cufftExecC2C(splan, (cufftComplex*)sdata, (cufftComplex*)sdata, CUFFT_FORWARD);
float get_cufft_filter_result(size_t batchID, size_t nts, size_t t){
return cufft_mem_in[(batchID * nts) + t].x;
}
void sDestroyCUFFT()
{
cufftDestroy(splan);
}
int
cdo_cufft_init(size_t num_time_steps, size_t batch_size){
void dPlanCUFFT(int n, void *stream)
{
cufftPlan1d(&dplan, n, CUFFT_Z2Z, 1);
cufftSetStream(dplan, (cudaStream_t)stream);
}
int success = 0;
if(cdo_cufft_initialized == true){
success = -77;
return success;
}
//?? time to spectral and spectral to time ????
success = cufftPlan1d(&cufftHandle_T2S, num_time_steps, CUFFT_C2C, batch_size);
if(success != CUFFT_SUCCESS){
printf("error in creating T2S plan in cufft.cu");
return success;
}
success = cufftPlan1d(&cufftHandle_S2T, num_time_steps, CUFFT_C2C, batch_size);
if(success != CUFFT_SUCCESS){
printf("error in creating S2T plan in cufft.cu");
return success;
}
// Test if worked => cuda return code
void dExecCUFFT(double *ddata)
{
cufftExecZ2Z(dplan, (cufftDoubleComplex*)ddata, (cufftDoubleComplex*)ddata, CUFFT_FORWARD);
}
success = cudaMallocManaged((void **) &cufft_mem_in, sizeof(cufftComplex) * num_time_steps * batch_size);
if(success != CUFFT_SUCCESS){
printf("error allocating input memory in cufft.cu");
return success;
}
success = cudaMallocManaged((void **) &cufft_mem_out, sizeof(cufftComplex) * num_time_steps * batch_size);
if(success != CUFFT_SUCCESS){
printf("error allocating output memory in cufft.cu");
return success;
}
void dDestroyCUFFT()
{
cufftDestroy(dplan);
if(success == 0){
cdo_cufft_initialized = true;
}
return success;
}
/*
void sLaunchCUFFT(float *sdata, int n, void *stream)
{
cufftHandle plan;
cufftPlan1d(&plan, n, CUFFT_C2C, 1);
cufftSetStream(plan, (cudaStream_t)stream);
cufftExecC2C(plan, (cufftComplex*)sdata, (cufftComplex*)sdata, CUFFT_FORWARD);
cufftDestroy(plan);
int cdo_cufft_gpu_routine(int* fmasc,size_t nts, size_t batch_size){
int success = 0;
success = cufftExecC2C(cufftHandle_T2S, cufft_mem_in, cufft_mem_out,CUFFT_FORWARD);
if(success != CUFFT_SUCCESS){
return success;
}
for (size_t batchID = 0; batchID < batch_size; batchID++)
{
for (size_t i = 0; i < nts; ++i)
if (!fmasc[i])
{
cufft_mem_in[(batchID * nts) + i].x = 0.0;
cufft_mem_in[(batchID * nts) + i].y = 0.0;
}
}
success = cufftExecC2C(cufftHandle_S2T, cufft_mem_in, cufft_mem_out,CUFFT_INVERSE);
if(success != CUFFT_SUCCESS){
return success;
}
//TODO error codes
return CUFFT_SUCCESS;
}
void dLaunchCUFFT(double *ddata, int n, void *stream)
{
cufftHandle plan;
cufftPlan1d(&plan, n, CUFFT_Z2Z, 1);
cufftSetStream(plan, (cudaStream_t)stream);
cufftExecZ2Z(plan, (cufftDoubleComplex*)ddata, (cufftDoubleComplex*)ddata, CUFFT_FORWARD);
cufftDestroy(plan);
int cdo_cufft_free(){
int success = 0;
success = cudaFree(cufft_mem_in);
if(success != CUDA_SUCCESS){
printf("Error: could not frree cufft cuda input memory");
return success;
}
success = cudaFree(cufft_mem_out);
if(success != CUDA_SUCCESS){
printf("Error: could not frree cufft cuda output memory");
return success;
}
success = cufftDestroy(cufftHandle_T2S);
if(success != CUDA_SUCCESS){
printf("Error: could not frree cufft cuda plan T2S");
return success;}
success = cufftDestroy(cufftHandle_S2T);
if(success != CUDA_SUCCESS){
printf("Error: could not frree cufft cuda plan S2T");
return success;}
cdo_cufft_initialized = false;
return success;
}
*/
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment