diff --git a/ChangeLog b/ChangeLog
index 60d0a39cd565c2e01a08634b16d6a5fcfd979c00..df46739e4e7973131c08dba22b620d9a551f1639 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2025-02-28  Uwe Schulzweida
+
+	* Version 2.5.1 released
+
+2024-12-28  Uwe Schulzweida
+
+	* NetCDF4: improved calculation of output chunk size
+
+2024-12-23  Uwe Schulzweida
+
+	* NetCDF4: improved calculation of input chunk cache size for numStep>1 and zSize>1
+
 2024-11-28  Uwe Schulzweida
 
         * using CGRIBEX library version 2.3.1
diff --git a/configure.ac b/configure.ac
index caa057f45a4237c314588e343f073fcbedc14504..8855b67b2cd093876f78c5589a76f6d88889e17d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -7,7 +7,7 @@
 AC_PREREQ([2.69])
 LT_PREREQ([2.4.6])
 
-AC_INIT([cdi],[2.5.0],[https://mpimet.mpg.de/cdi])
+AC_INIT([cdi],[2.5.1],[https://mpimet.mpg.de/cdi])
 AC_DEFINE_UNQUOTED(CDI, ["$PACKAGE_VERSION"], [CDI version])
 
 AC_CONFIG_AUX_DIR([config])
diff --git a/src/cdf_write.c b/src/cdf_write.c
index eb556afcac38904a2ddbc11224283458fb6d6c16..1e73c949e05de87536b0ae001f9e45fcaa4d1463 100644
--- a/src/cdf_write.c
+++ b/src/cdf_write.c
@@ -702,19 +702,41 @@ calc_chunksize(size_t chunkSizeLim, size_t size)
 static const size_t chunkSizeMin = 262144;    // 256k
 static const size_t chunkSizeLim = 16777216;  // 16m
 
+size_t
+auto_chunksize_y(size_t gridsize, size_t xsize, size_t ysize)
+{
+  size_t gridChunkSize = calc_chunksize(chunkSizeLim, gridsize);
+  size_t yChunkSize = gridChunkSize / xsize;
+  size_t numChunks = (ysize % yChunkSize == 0) ? (ysize / yChunkSize) : (ysize / yChunkSize) + 1;
+  size_t modChunk = ysize % numChunks;
+  size_t maxChunks = numChunks * 2;
+  for (size_t i = numChunks; i < maxChunks; ++i)
+    {
+      size_t imod = ysize % i;
+      if (imod == 0 || imod > modChunk)
+        {
+          numChunks = i;
+          modChunk = imod;
+          if (modChunk == 0) break;
+        }
+    }
+  yChunkSize = ysize / numChunks;
+  return yChunkSize;
+}
+
 size_t
 calc_chunksize_y(int chunkType, size_t gridsize, size_t xsize, size_t ysize)
 {
   if (chunkType == CDI_CHUNK_AUTO)
-    return (gridsize <= chunkSizeMin) ? ysize : chunkSizeMin / xsize;
+    return auto_chunksize_y(gridsize, xsize, ysize);
   else
     return (chunkType == CDI_CHUNK_LINES) ? 1 : ysize;
 }
 
 size_t
-calc_chunksize_x(int chunkType, long chunkSize, size_t xsize, bool yIsUndefined)
+calc_chunksize_x(int chunkType, long chunkSize, size_t xsize, bool isReg2dGrid)
 {
-  if (chunkType == CDI_CHUNK_AUTO && yIsUndefined)
+  if (chunkType == CDI_CHUNK_AUTO && !isReg2dGrid)
     return (chunkSize > 0 && (size_t) chunkSize < xsize) ? (size_t) chunkSize : ((xsize <= chunkSizeMin) ? xsize : chunkSizeMin);
   else
     return calc_chunksize(chunkSizeLim, xsize);
@@ -753,9 +775,10 @@ cdfDefineDimsAndChunks(const stream_t *streamptr, int varID, int xid, int yid, i
   cdiInqKeyInt(vlistID, varID, CDI_KEY_CHUNKSIZE, &chunkSize);
   if (chunkSize > 0 && yid == CDI_UNDEFID) chunkType = CDI_CHUNK_AUTO;
 
-  if (chunkType == CDI_CHUNK_GRID && gridsize > ChunkSizeLim)
+  bool isReg2dGrid = (yid != CDI_UNDEFID && xid != CDI_UNDEFID);
+  if (chunkType == CDI_CHUNK_GRID && gridsize > ChunkSizeLim && isReg2dGrid)
     {
-      if (CDI_Debug) fprintf(stderr, "gridsize > %u, changed chunkType to CDI_CHUNK_LINES!\n", ChunkSizeLim);
+      if (CDI_Debug) fprintf(stderr, "gridsize > %u, changed chunkType to CDI_CHUNK_AUTO!\n", ChunkSizeLim);
       chunkType = CDI_CHUNK_AUTO;
     }
 
@@ -776,7 +799,7 @@ cdfDefineDimsAndChunks(const stream_t *streamptr, int varID, int xid, int yid, i
         }
       else if (dimorder[id] == 1 && xid != CDI_UNDEFID)
         {
-          chunks[ndims] = calc_chunksize_x(chunkType, chunkSize, xsize, (yid == CDI_UNDEFID));
+          chunks[ndims] = calc_chunksize_x(chunkType, chunkSize, xsize, isReg2dGrid);
           dims[ndims] = xid;
           ndims++;
         }
diff --git a/src/stream_cdf_i.c b/src/stream_cdf_i.c
index 9b7565a012bdd95b2b93a32f63d07e6db6648768..cf4f187dfe42277967856cc6594cc929a9c64d37 100644
--- a/src/stream_cdf_i.c
+++ b/src/stream_cdf_i.c
@@ -115,6 +115,7 @@ typedef struct
   size_t gridSize;
   size_t xSize;
   size_t ySize;
+  size_t zSize;
   int nattsNC;
   int natts;
   int *atts;
@@ -649,6 +650,7 @@ init_ncvars(int nvars, ncvar_t *ncvars, int ncid)
       ncvar->gridSize = 0;
       ncvar->xSize = 0;
       ncvar->ySize = 0;
+      ncvar->zSize = 0;
       ncvar->nattsNC = 0;
       ncvar->natts = 0;
       ncvar->atts = NULL;
@@ -2115,7 +2117,7 @@ cdf_load_vals(size_t size, int ndims, int varid, ncvar_t *ncvar, double **gridva
 {
   if (CDI_Netcdf_Lazy_Grid_Load)
     {
-      *valsGet = (struct xyValGet){
+      *valsGet = (struct xyValGet) {
         .scalefactor = ncvar->scalefactor,
         .addoffset = ncvar->addoffset,
         .start = { start[0], start[1], start[2] },
@@ -3261,12 +3263,12 @@ cdf_define_all_zaxes(stream_t *streamptr, int vlistID, ncdim_t *ncdims, int nvar
 	          if      (ncvars[zvarid].xtype == NC_FLOAT) zdatatype = CDI_DATATYPE_FLT32;
 	          else if (ncvars[zvarid].xtype == NC_INT)   zdatatype = CDI_DATATYPE_INT32;
 	          else if (ncvars[zvarid].xtype == NC_SHORT) zdatatype = CDI_DATATYPE_INT16;
-                // clang-format on
-                // don't change the name !!!
-                /*
-                  if ((len = strlen(pname)) > 2)
-                  if (pname[len-2] == '_' && isdigit((int) pname[len-1])) pname[len-2] = 0;
-                */
+              // clang-format on
+              // don't change the name !!!
+              /*
+                if ((len = strlen(pname)) > 2)
+                if (pname[len-2] == '_' && isdigit((int) pname[len-1])) pname[len-2] = 0;
+              */
 #ifndef USE_MPI
               if (zaxisType == ZAXIS_CHAR && ncvars[zvarid].ndims == 2)
                 {
@@ -3339,6 +3341,7 @@ cdf_define_all_zaxes(stream_t *streamptr, int vlistID, ncdim_t *ncdims, int nvar
 
           ncvar->zaxisID = varDefZaxis(vlistID, zaxisType, (int) zsize, zvar, (const char **) zcvals, zclength, lbounds, ubounds,
                                        (int) vctsize, vct, pname, plongname, punits, zdatatype, 1, 0, -1);
+          ncvar->zSize = zsize;
 
           int zaxisID = ncvar->zaxisID;
 
@@ -3559,20 +3562,21 @@ size_of_dim_chunks(size_t n, size_t c)
 static size_t
 calc_chunk_cache_size(int timedimid, ncvar_t *ncvar)
 {
-  size_t nx = 0, ny = 0;
+  size_t nx = 0, ny = 0, nz = 0;
   size_t cx = 0, cy = 0, cz = 0;
   for (int i = 0; i < ncvar->ndims; i++)
     {
       int dimtype = ncvar->dimtypes[i];
       // clang-format off
-      if      (dimtype == Z_AXIS) { cz = ncvar->chunks[i]; }
+      if      (dimtype == Z_AXIS) { cz = ncvar->chunks[i]; nz = ncvar->zSize; }
       else if (dimtype == Y_AXIS) { cy = ncvar->chunks[i]; ny = ncvar->ySize; }
       else if (dimtype == X_AXIS) { cx = ncvar->chunks[i]; nx = ncvar->xSize; }
       // clang-format on
     }
 
-  size_t chunkCacheSize = (ncvar->dimids[0] == timedimid) ? ncvar->chunks[0] : 1;
-  if (cz > 0) chunkCacheSize *= cz;
+  size_t numSteps = (ncvar->dimids[0] == timedimid) ? ncvar->chunks[0] : 1;
+  size_t chunkCacheSize = numSteps;
+  if (nz > 0 && cz > 0) chunkCacheSize *= (numSteps == 1) ? cz : size_of_dim_chunks(nz, cz);
 
   if (chunkCacheSize == 1) return 0;  // no chunk cache needed because the full field is read