progress on cufft implementation

da7835cb · Oliver Heidmann · 72d3f608 · da7835cb · da7835cb · da7835cb
Commit da7835cb authored 2 years ago by Oliver Heidmann
--- a/src/Filter.cc
+++ b/src/Filter.cc
@@ -35,14 +35,12 @@

 // Forward declaration of wrapper functions that will call CUFFT
 // extern void sLaunchCUFFT(float *d_data, int n, void *stream);
-extern void sPlan1dCUFFT(int fftsize, void *stream);
-extern void sExecCUFFT(float *sarray);
-extern void sDestroyCUFFT();
-// extern void dLaunchCUFFT(double *d_data, int n, void *stream);
-extern void dPlan1dCUFFT(int fftsize, void *stream);
-extern void dExecCUFFT(double *darray);
-extern void dDestroyCUFFT();
-
+int cdo_cufft_init(size_t num_time_steps,size_t batch_size);
+int cdo_cufft_gpu_routine(int* fmasc,size_t nts, size_t batch_size);
+int cdo_cufft_free();
+int cdo_set_gpu_mem(float value, size_t nts, size_t t, size_t batchID);
+float get_cufft_filter_result(size_t batchID, size_t nts, size_t t);
+bool CUFFT_FILTER = false;
 static void
 create_fmasc(int nts, double fdata, double fmin, double fmax, std::vector<int> &fmasc)
 {
@@ -161,86 +159,75 @@ cdo_ff(std::vector<FourierMemory> &fourierMemory, int nts, VarList &varList, int
    }
 }

-int
+    int
 cdo_cufft(int nts, VarList &varList, int nvars, FieldVector3D &vars, std::vector<int> &fmasc)
 {
+#define HAVE_CUFFT
 #ifdef HAVE_CUFFT

-  CdoOptions::cufft_filter_batchsize
-      = 1024;  // tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
-  size_t batch_size = CdoOptions::cufft_filter_batchsize;
-
-  cufftReal *cufft_mem_in;
-  cufftComplex *cufft_mem_out;
-  cufftHandle cufftHandle_T2S;
-  cufftHandle cufftHandle_S2T;
-
-  //?? time to spectral and spectral to time ????
-  cufftPlan1d(num_time_steps, cufft_mem_in, cufft_mem_out, 1, FFTW_ESTIMATE);
-  cufftPlan1d(num_time_steps, cufft_mem_out, cufft_mem_in, -1, FFTW_ESTIMATE);
-  // Test if worked => cuda return code
+    size_t batch_size = 1;// Options::cufft_filter_batchsize; 

-  cudaMalloc((void **) &cufft_mem_in, sizeof(cufftReal) * num_time_steps * batch_size);
-  cudaMalloc((void **) &cufft_mem_out, sizeof(cufftComplex) * num_time_steps * batch_size);
+    int success = cdo_cufft_init(nts,batch_size);
+    if(success != 0){
+        return success;
+    }
+    else if(true){
+        Debug(CUFFT_FILTER,"cdo CUFFT INIT SUCCESS");
+        Debug(CUFFT_FILTER,"Size: %d", nts * batch_size);
+        //Debug("cdo CUFFT INIT SUCCESS");
+    }

-  for (int varID = 0; varID < nvars; ++varID)
+    for (int varID = 0; varID < nvars; ++varID)
    {
-      auto fieldMemType = varList[varID].memType;
-      const auto gridsize = varList[varID].gridsize;
-      for (int levelID = 0; levelID < varList[varID].nlevels; ++levelID)
+        auto fieldMemType = varList[varID].memType;
+        const auto gridsize = varList[varID].gridsize;
+        for (int levelID = 0; levelID < varList[varID].nlevels; ++levelID)
        {
-          for (size_t i = 0; i < gridsize; i += batch_size)
+            for (size_t i = 0; i < gridsize; i += batch_size)
            {
-              // prepare date for gpu
-              //-----------------------------------
-              for (size_t batchID = 0; batchID < batch_size; batchID++)
+                // prepare date for gpu
+                //-----------------------------------
+                for (size_t batchID = 0; batchID < batch_size; batchID++)
                {
-                  for (int t = 0; t < nts; ++t)
+                    for (int t = 0; t < nts && i+batchID < gridsize; ++t)
                    {
-                      cufft_mem_in[(batchID * nts) + t] = vars[t][varID][levelID].vec_f[i + batchID];
+                        //Debug(CUFFT_FILTER,"accessing %d", (nts * batchID) + t);
+                        cdo_set_gpu_mem(vars[t][varID][levelID].vec_f[(i*batch_size)+batchID],nts,t,batchID);
                    }
                }

-              // gpu calculations
-              //-----------------------------------
-              cufftExecR2C(cufftHandle_T2S, cufft_mem_in, cufft_mem_out);
-
-              for (size_t batchID = 0; batchID < batch_size; batchID++)
-                {
-                  for (int i = 0; i < nts; ++i)
-                    if (!fmasc[i])
-                      {
-                        cufft_mem_out[(batchID * nts) + i][0] = 0.0;
-                        cufft_mem_out[(batchID * nts) + i][1] = 0.0;
-                      }
+                // gpu calculations
+                //-----------------------------------
+                auto success = cdo_cufft_gpu_routine(fmasc.data(),nts,batch_size);
+                if(success != 0){
+                    cdo_print("ERROR in cufft gpu routine, err code = %d", success);
                }
-
-              cufftExecC2R(cufftHandle_T2S, cufft_mem_out, cufft_mem_in);
-
-              // retransform
-              //-----------------------------------
-              for (size_t batchID = 0; batchID < batch_size; batchID++)
+                // retransform
+                //
+                //-----------------------------------
+                for (size_t outBatchID = 0; outBatchID < batch_size; outBatchID++)
                {
-                  for (int t = 0; t < nts; ++t)
+                    for (int t = 0; t < nts && i+outBatchID < gridsize; ++t)
                    {
-                      // divide by nts for normalization, check if cufft does that
-                      vars[t][varID][levelID].vec_d[i] = cufft_mem_in[(batchID * nts) + t] / nts;
+                        Debug(CUFFT_FILTER,"%d, %d, %d", nts, outBatchID, t);
+                        Debug(CUFFT_FILTER,"accessing %d", (nts * outBatchID) + t);
+                        Debug(CUFFT_FILTER,"accessing %d", (i * batch_size)+outBatchID);
+                        // divide by nts for normalization, check if cufft does that
+                        float res = get_cufft_filter_result(outBatchID,nts,t);
+                        Debug(CUFFT_FILTER,"got res");
+                        vars[t][varID][levelID].vec_f[(i*batch_size) +outBatchID] = res/nts;
                    }
                }
-              //-----------------------------------
+                //-----------------------------------
            }
        }
    }
-  // todo replace with cufft
-  cudaFree(cufft_mem_in);
-  cudaFree(cufft_mem_out);
-  cufftDestroy(cufftHandle_T2S);
-  cufftDestroy(cufftHandle_S2T);
-  return true;
+    cdo_cufft_free();
+    return 0;
 #else
-  cdo_warning("LIBFFTW3 support not compiled in!");
-  cdo_warning("Using intrinsic FFT function!");
-  return false;
+    cdo_warning("LIBFFTW3 support not compiled in!");
+    cdo_warning("Using intrinsic FFT function!");
+    return -1;
 #endif
 }

@@ -417,7 +404,6 @@ Filter(void *process)
  const auto nts = tsID;
  if (nts <= 1) cdo_abort("Number of time steps <= 1!");

-  std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);

  double fmin = 0.0, fmax = 0.0;
  switch (operfunc)
@@ -455,14 +441,19 @@ Filter(void *process)
  if (Options::use_gpu)
    {
      auto success = cdo_cufft(nts, varList, nvars, vars, fmasc);
+      if(success != 0){
+          cdo_print("Error in cdo_cufft");
      return process;
+      }
    }
  else if (Options::Use_FFTW)
    {
+  std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);
      cdo_fftw(fourierMemory, nts, varList, nvars, vars, fmasc);
    }
  else
    {
+  std::vector<FourierMemory> fourierMemory(Threading::ompNumThreads);
      cdo_ff(fourierMemory, nts, varList, nvars, vars, fmasc);
    }


--- a/src/cdo_options.cc
+++ b/src/cdo_options.cc
@@ -28,7 +28,10 @@ bool stderrIsTerminal = false;
 namespace Options
 {
 bool use_gpu = false;
- 
+
+// tbd .... depends on number of timesteps and gpu memor size /availability (have fun future me...)
+size_t cufft_filter_batchsize = 1024;
+
 int numStreamWorker;
 long coresize = 0;
 int nsb = 0; // Number of significant bits

--- a/src/cdo_options.h
+++ b/src/cdo_options.h
@@ -32,6 +32,7 @@ namespace Options
 {
 //temporary option for gpus (10.3.2021)
 extern bool use_gpu;
+extern size_t cufft_filter_batchsize;
 extern long coresize;
 extern int numStreamWorker;
 extern int nsb; // Number of significant bits

--- a/src/cufft.cu
+++ b/src/cufft.cu
 #include <cufft.h>
+#include <cuda.h>
+#include <cstdio>

-cufftHandle splan;
-cufftHandle dplan;
+cufftComplex *cufft_mem_in;
+cufftComplex *cufft_mem_out;
+cufftHandle cufftHandle_T2S;
+cufftHandle cufftHandle_S2T;
+int cdo_cufft_initialized = false;

-void sPlanCUFFT(int n, void *stream)
-{
-    cufftPlan1d(&splan, n, CUFFT_C2C, 1);
-    cufftSetStream(splan, (cudaStream_t)stream);
+int cdo_set_gpu_mem(cufftReal value,  size_t nts, size_t t, size_t batchID){
+    cufft_mem_in[(batchID * nts) + t].x = value;
+    cufft_mem_in[(batchID * nts) + t].y = 0.0;
+    return 1;
 }

-void sExecCUFFT(float *sdata)
-{
-    cufftExecC2C(splan, (cufftComplex*)sdata, (cufftComplex*)sdata, CUFFT_FORWARD);
+float get_cufft_filter_result(size_t batchID, size_t nts, size_t t){
+    return  cufft_mem_in[(batchID * nts) + t].x;
 }

-void sDestroyCUFFT()
-{
-    cufftDestroy(splan);
-}
+int
+cdo_cufft_init(size_t num_time_steps, size_t batch_size){

-void dPlanCUFFT(int n, void *stream)
-{
-    cufftPlan1d(&dplan, n, CUFFT_Z2Z, 1);
-    cufftSetStream(dplan, (cudaStream_t)stream);
-}
+    int success = 0;
+    if(cdo_cufft_initialized == true){
+        success = -77;
+        return success;
+    }
+    //?? time to spectral and spectral to time ????
+    success = cufftPlan1d(&cufftHandle_T2S, num_time_steps, CUFFT_C2C, batch_size);
+    if(success != CUFFT_SUCCESS){
+        printf("error in creating T2S plan in cufft.cu");
+        return success;
+    }
+    success = cufftPlan1d(&cufftHandle_S2T, num_time_steps, CUFFT_C2C, batch_size);
+    if(success != CUFFT_SUCCESS){
+        printf("error in creating S2T plan in cufft.cu");
+        return success;
+    }
+    // Test if worked => cuda return code

-void dExecCUFFT(double *ddata)
-{
-    cufftExecZ2Z(dplan, (cufftDoubleComplex*)ddata, (cufftDoubleComplex*)ddata, CUFFT_FORWARD);
-}
+    success  = cudaMallocManaged((void **) &cufft_mem_in, sizeof(cufftComplex) * num_time_steps * batch_size);
+    if(success != CUFFT_SUCCESS){
+        printf("error allocating input memory in cufft.cu");
+        return success;
+    }
+    success  = cudaMallocManaged((void **) &cufft_mem_out, sizeof(cufftComplex) * num_time_steps * batch_size);
+    if(success != CUFFT_SUCCESS){
+        printf("error allocating output memory in cufft.cu");
+        return success;
+    }

-void dDestroyCUFFT()
-{
-    cufftDestroy(dplan);
+    if(success == 0){
+        cdo_cufft_initialized = true;
+    }
+    return success;
 }

-/*
-void sLaunchCUFFT(float *sdata, int n, void *stream)
-{
-    cufftHandle plan;
-    cufftPlan1d(&plan, n, CUFFT_C2C, 1);
-    cufftSetStream(plan, (cudaStream_t)stream);
-    cufftExecC2C(plan, (cufftComplex*)sdata, (cufftComplex*)sdata, CUFFT_FORWARD);
-    cufftDestroy(plan);
+int cdo_cufft_gpu_routine(int* fmasc,size_t nts, size_t batch_size){
+
+    int success = 0;
+    success = cufftExecC2C(cufftHandle_T2S, cufft_mem_in, cufft_mem_out,CUFFT_FORWARD);
+
+    if(success != CUFFT_SUCCESS){
+        return success;
+    }
+
+    for (size_t batchID = 0; batchID < batch_size; batchID++)
+    {
+        for (size_t i = 0; i < nts; ++i)
+            if (!fmasc[i])
+            {
+                cufft_mem_in[(batchID * nts) + i].x = 0.0;
+                cufft_mem_in[(batchID * nts) + i].y = 0.0;
+            }
+    }
+
+    success = cufftExecC2C(cufftHandle_S2T, cufft_mem_in, cufft_mem_out,CUFFT_INVERSE);
+    if(success != CUFFT_SUCCESS){
+        return success;
+    }
+
+    //TODO error codes
+    return CUFFT_SUCCESS;
+
 }

-void dLaunchCUFFT(double *ddata, int n, void *stream)
-{
-    cufftHandle plan;
-    cufftPlan1d(&plan, n, CUFFT_Z2Z, 1);
-    cufftSetStream(plan, (cudaStream_t)stream);
-    cufftExecZ2Z(plan, (cufftDoubleComplex*)ddata, (cufftDoubleComplex*)ddata, CUFFT_FORWARD);
-    cufftDestroy(plan);
+int cdo_cufft_free(){
+    int success = 0;
+    success = cudaFree(cufft_mem_in);
+    if(success != CUDA_SUCCESS){
+        printf("Error: could not frree cufft cuda input memory");
+        return success;
+    }
+    success = cudaFree(cufft_mem_out);
+    if(success != CUDA_SUCCESS){
+        printf("Error: could not frree cufft cuda output memory");
+        return success;
+    }
+    success = cufftDestroy(cufftHandle_T2S);
+    if(success != CUDA_SUCCESS){
+        printf("Error: could not frree cufft cuda plan T2S");
+        return success;}
+    success  = cufftDestroy(cufftHandle_S2T);
+    if(success != CUDA_SUCCESS){
+        printf("Error: could not frree cufft cuda plan S2T");
+        return success;}
+    cdo_cufft_initialized = false;
+    return success;
 }
-*/
+