From 7611d377401467537fa8273bcd0b024df41cca14 Mon Sep 17 00:00:00 2001
From: Thomas Jahns <jahns@dkrz.de>
Date: Mon, 22 Aug 2022 14:22:07 +0200
Subject: [PATCH] Add alternative code path for huge buffers.

* In practice, indexing into RMA buffers may require 64-bit addressing,
  even though individual data sets are still limited to INT_MAX
  entries.
---
 config/config.guess |  34 +++++++---
 config/ltmain.sh    |  25 ++++---
 configure           |  83 +++++++++++++++++++++++-
 configure.ac        |  16 +++--
 src/config.h.in     |   9 +++
 src/pio_server.c    | 154 +++++++++++++++++++++++++++++++-------------
 6 files changed, 246 insertions(+), 75 deletions(-)

diff --git a/config/config.guess b/config/config.guess
index 7f76b6228..1817bdce9 100755
--- a/config/config.guess
+++ b/config/config.guess
@@ -4,7 +4,7 @@
 
 # shellcheck disable=SC2006,SC2268 # see below for rationale
 
-timestamp='2022-01-09'
+timestamp='2022-05-25'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -1151,16 +1151,27 @@ EOF
 	;;
     x86_64:Linux:*:*)
 	set_cc_for_build
+	CPU=$UNAME_MACHINE
 	LIBCABI=$LIBC
 	if test "$CC_FOR_BUILD" != no_compiler_found; then
-	    if (echo '#ifdef __ILP32__'; echo IS_X32; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
-		grep IS_X32 >/dev/null
-	    then
-		LIBCABI=${LIBC}x32
-	    fi
+	    ABI=64
+	    sed 's/^	    //' << EOF > "$dummy.c"
+	    #ifdef __i386__
+	    ABI=x86
+	    #else
+	    #ifdef __ILP32__
+	    ABI=x32
+	    #endif
+	    #endif
+EOF
+	    cc_set_abi=`$CC_FOR_BUILD -E "$dummy.c" 2>/dev/null | grep '^ABI' | sed 's, ,,g'`
+	    eval "$cc_set_abi"
+	    case $ABI in
+		x86) CPU=i686 ;;
+		x32) LIBCABI=${LIBC}x32 ;;
+	    esac
 	fi
-	GUESS=$UNAME_MACHINE-pc-linux-$LIBCABI
+	GUESS=$CPU-pc-linux-$LIBCABI
 	;;
     xtensa*:Linux:*:*)
 	GUESS=$UNAME_MACHINE-unknown-linux-$LIBC
@@ -1367,8 +1378,11 @@ EOF
     BePC:Haiku:*:*)	# Haiku running on Intel PC compatible.
 	GUESS=i586-pc-haiku
 	;;
-    x86_64:Haiku:*:*)
-	GUESS=x86_64-unknown-haiku
+    ppc:Haiku:*:*)	# Haiku running on Apple PowerPC
+	GUESS=powerpc-apple-haiku
+	;;
+    *:Haiku:*:*)	# Haiku modern gcc (not bound by BeOS compat)
+	GUESS=$UNAME_MACHINE-unknown-haiku
 	;;
     SX-4:SUPER-UX:*:*)
 	GUESS=sx4-nec-superux$UNAME_RELEASE
diff --git a/config/ltmain.sh b/config/ltmain.sh
index 9cf2bd7bc..5f5e7a13c 100644
--- a/config/ltmain.sh
+++ b/config/ltmain.sh
@@ -7646,10 +7646,7 @@ func_mode_link ()
 	case $pass in
 	dlopen) libs=$dlfiles ;;
 	dlpreopen) libs=$dlprefiles ;;
-	link)
-	  libs="$deplibs %DEPLIBS%"
-	  test "X$link_all_deplibs" != Xno && libs="$libs $dependency_libs"
-	  ;;
+	link) libs="$deplibs %DEPLIBS% $dependency_libs" ;;
 	esac
       fi
       if test lib,dlpreopen = "$linkmode,$pass"; then
@@ -7983,19 +7980,19 @@ func_mode_link ()
 	    # It is a libtool convenience library, so add in its objects.
 	    func_append convenience " $ladir/$objdir/$old_library"
 	    func_append old_convenience " $ladir/$objdir/$old_library"
-	    tmp_libs=
-	    for deplib in $dependency_libs; do
-	      deplibs="$deplib $deplibs"
-	      if $opt_preserve_dup_deps; then
-		case "$tmp_libs " in
-		*" $deplib "*) func_append specialdeplibs " $deplib" ;;
-		esac
-	      fi
-	      func_append tmp_libs " $deplib"
-	    done
 	  elif test prog != "$linkmode" && test lib != "$linkmode"; then
 	    func_fatal_error "'$lib' is not a convenience library"
 	  fi
+	  tmp_libs=
+	  for deplib in $dependency_libs; do
+	    deplibs="$deplib $deplibs"
+	    if $opt_preserve_dup_deps; then
+	      case "$tmp_libs " in
+	      *" $deplib "*) func_append specialdeplibs " $deplib" ;;
+	      esac
+	    fi
+	    func_append tmp_libs " $deplib"
+	  done
 	  continue
 	fi # $pass = conv
 
diff --git a/configure b/configure
index 3750943b1..0edd1c6c3 100755
--- a/configure
+++ b/configure
@@ -31187,6 +31187,8 @@ as_fn_error $? "required header yaxt.h not found or not compilable
 See \`config.log' for more details" "$LINENO" 5; }
 fi
 
+   as_fn_append CFLAGS " $YAXT_C_INCLUDE"
+   LIBS="$YAXT_C_LIB $LIBS"
       defined_Xt_uid=no
    ac_fn_c_check_decl "$LINENO" "MPI_UINT64_T" "ac_cv_have_decl_MPI_UINT64_T" "$ac_includes_default
 #include <mpi.h>
@@ -31196,9 +31198,7 @@ if test "x$ac_cv_have_decl_MPI_UINT64_T" = xyes; then :
 
       defined_Xt_uid=yes
 else
-  as_fn_append CFLAGS " $YAXT_C_INCLUDE"
-      LIBS="$YAXT_C_LIB $LIBS"
-      # The cast to long int works around a bug in the HP C Compiler
+  # The cast to long int works around a bug in the HP C Compiler
 # version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
 # declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
 # This bug is HP SR number 8606223364.
@@ -31334,6 +31334,83 @@ as_fn_error $? "no way to communicate Xt_uid found
 See \`config.log' for more details" "$LINENO" 5; }
 fi
    { defined_Xt_uid=; unset defined_Xt_uid;}
+   ac_fn_c_check_func "$LINENO" "xt_redist_p2p_aext_new" "ac_cv_func_xt_redist_p2p_aext_new"
+if test "x$ac_cv_func_xt_redist_p2p_aext_new" = xyes; then :
+  # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of MPI_Aint" >&5
+$as_echo_n "checking size of MPI_Aint... " >&6; }
+if ${ac_cv_sizeof_MPI_Aint+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (MPI_Aint))" "ac_cv_sizeof_MPI_Aint"        "$ac_includes_default
+#include <mpi.h>
+"; then :
+
+else
+  if test "$ac_cv_type_MPI_Aint" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (MPI_Aint)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_MPI_Aint=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_MPI_Aint" >&5
+$as_echo "$ac_cv_sizeof_MPI_Aint" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_MPI_AINT $ac_cv_sizeof_MPI_Aint
+_ACEOF
+
+
+      # The cast to long int works around a bug in the HP C Compiler
+# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects
+# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'.
+# This bug is HP SR number 8606223364.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of int" >&5
+$as_echo_n "checking size of int... " >&6; }
+if ${ac_cv_sizeof_int+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (int))" "ac_cv_sizeof_int"        "$ac_includes_default"; then :
+
+else
+  if test "$ac_cv_type_int" = yes; then
+     { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+as_fn_error 77 "cannot compute sizeof (int)
+See \`config.log' for more details" "$LINENO" 5; }
+   else
+     ac_cv_sizeof_int=0
+   fi
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_int" >&5
+$as_echo "$ac_cv_sizeof_int" >&6; }
+
+
+
+cat >>confdefs.h <<_ACEOF
+#define SIZEOF_INT $ac_cv_sizeof_int
+_ACEOF
+
+
+      if test $ac_cv_sizeof_MPI_Aint -gt $ac_cv_sizeof_int; then :
+
+$as_echo "#define USE_XT_REDIST_P2P_AEXT_NEW 1" >>confdefs.h
+
+fi
+fi
+
    CFLAGS=$saved_CFLAGS
    LIBS=$saved_LIBS
 
diff --git a/configure.ac b/configure.ac
index 11a799591..f8d4aa12c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -397,16 +397,17 @@ dnl whatever pkg-config finds will be used instead.
         [AC_MSG_FAILURE([cannot link C YAXT programs])],,[$YAXT_C_LIB])],
      [AC_MSG_FAILURE([required header yaxt.h not found or not compilable])],,
      [$YAXT_C_INCLUDE],[[]])
-dnl
+dnl the following tests can rely on having yaxt.h and C libs available
+   AS_VAR_APPEND([CFLAGS], [" $YAXT_C_INCLUDE"])
+   LIBS="$YAXT_C_LIB $LIBS"
+dnl determine how to xfer Xt_uid values by MPI
    AH_TEMPLATE([YAXT_UID_DT],
      [Defined to MPI datatype to be used for Xt_uid])dnl
    defined_Xt_uid=no
    AC_CHECK_DECL([MPI_UINT64_T],
      [AC_DEFINE([YAXT_UID_DT], [MPI_UINT64_T])
       defined_Xt_uid=yes],
-     [AS_VAR_APPEND([CFLAGS], [" $YAXT_C_INCLUDE"])
-      LIBS="$YAXT_C_LIB $LIBS"
-      AC_CHECK_SIZEOF([Xt_uid],,
+     [AC_CHECK_SIZEOF([Xt_uid],,
         [AC_INCLUDES_DEFAULT
 @%:@include <yaxt.h>])
       AC_CHECK_SIZEOF([unsigned long])
@@ -425,6 +426,13 @@ dnl
    AS_VAR_IF([defined_Xt_uid], [no],
      [AC_MSG_FAILURE([no way to communicate Xt_uid found])])
    AS_UNSET([defined_Xt_uid])
+dnl determine if xt_redist_p2p_aext_new is available
+   AC_CHECK_FUNC([xt_redist_p2p_aext_new],
+     [AC_CHECK_SIZEOF([MPI_Aint],,[AC_INCLUDES_DEFAULT
+@%:@include <mpi.h>])
+      AC_CHECK_SIZEOF([int])
+      AS_IF([test $ac_cv_sizeof_MPI_Aint -gt $ac_cv_sizeof_int],
+        [AC_DEFINE([USE_XT_REDIST_P2P_AEXT_NEW],[1],[Defined if aext alternatives should be tested])])])
    CFLAGS=$saved_CFLAGS
    LIBS=$saved_LIBS
 
diff --git a/src/config.h.in b/src/config.h.in
index 4f24a77de..7807f9508 100644
--- a/src/config.h.in
+++ b/src/config.h.in
@@ -253,6 +253,12 @@
    your system. */
 #undef PTHREAD_CREATE_JOINABLE
 
+/* The size of `int', as computed by sizeof. */
+#undef SIZEOF_INT
+
+/* The size of `MPI_Aint', as computed by sizeof. */
+#undef SIZEOF_MPI_AINT
+
 /* The size of `unsigned long', as computed by sizeof. */
 #undef SIZEOF_UNSIGNED_LONG
 
@@ -274,6 +280,9 @@
 /* User name */
 #undef USER_NAME
 
+/* Defined if aext alternatives should be tested */
+#undef USE_XT_REDIST_P2P_AEXT_NEW
+
 /* Version number of package */
 #undef VERSION
 
diff --git a/src/pio_server.c b/src/pio_server.c
index 4a3a67a9f..4106fb706 100644
--- a/src/pio_server.c
+++ b/src/pio_server.c
@@ -284,7 +284,7 @@ resizeVarGatherBuf(size_t size, void **buf, size_t *bufSize)
 #define wHECast(buf) ((struct winHeaderEntry *)(void *)buf)
 
 static Xt_xmap
-buildVarXmap(struct Xt_offset_ext *restrict partExts,
+buildVarXmap(int *restrict partSizes,
              const struct clientBuf *restrict clientBuf, size_t headerIdx,
              Xt_idxlist dstList,
              Xt_idxlist *partDescPreset,
@@ -297,8 +297,6 @@ buildVarXmap(struct Xt_offset_ext *restrict partExts,
   Xt_idxlist *part
     = partDescPreset ? partDescPreset : Malloc(numClients * sizeof (part[0]));
   int conversion = (wHECast(clientBuf[0].mem))[headerIdx].id;
-  size_t elemSize
-    = conversion == DATA_HEADER_FLOAT ? sizeof (float) : sizeof (double);
   for (size_t clientIdx = 0; clientIdx < numClients; ++clientIdx)
     {
       unsigned char *clientMem = clientBuf[clientIdx].mem;
@@ -319,17 +317,7 @@ buildVarXmap(struct Xt_offset_ext *restrict partExts,
             = xt_idxlist_unpack(clientMem, (int)clientBuf[clientIdx].size,
                                 &position, pioInterComm);
         }
-      unsigned partSize
-        = (unsigned)xt_idxlist_get_num_indices(part[clientIdx]);
-      size_t charOfs = (size_t)((clientMem
-                                 + (wHECast(clientMem))[headerIdx].offset)
-                                - clientBuf[0].mem);
-      xassert(charOfs % elemSize == 0
-              && charOfs / elemSize + partSize <= INT_MAX);
-      int elemOfs = (int)(charOfs / elemSize);
-      partExts[clientIdx].start = elemOfs;
-      partExts[clientIdx].size = (int)partSize;
-      partExts[clientIdx].stride = 1;
+      partSizes[clientIdx+1] = xt_idxlist_get_num_indices(part[clientIdx]);
     }
   Xt_idxlist srcList = xt_idxlist_collection_new(part, (int)numClients);
   if (!partDescPreset)
@@ -818,6 +806,82 @@ buildDecoPresetXmaps(int streamID, struct partDescPreset clientDeco,
     }
 }
 
+struct partExtRes
+{
+  void *partExts; /* Malloc'ed extents array, one entry per client */
+  bool needAExt;  /* partExts holds struct Xt_aoffset_ext entries if set */
+};
+
+/* Build the per-client source extents for the variable described by
+ * header entry headerIdx; offsets are relative to clientBuf[0].mem and
+ * partSizes[i+1] is the number of elements of client i.  The returned
+ * array is allocated here and must be freed by the caller; it is in
+ * byte units if needAExt is set and in element units otherwise. */
+static struct partExtRes
+fillPartExts(const int *partSizes,
+             const struct clientBuf *restrict clientBuf, size_t headerIdx)
+{
+  int conversion = (wHECast(clientBuf[0].mem))[headerIdx].id;
+  int varID = wHECast(clientBuf[0].mem)[headerIdx].specific.dataRecord.varID;
+  size_t elemSize
+    = conversion == DATA_HEADER_FLOAT ? sizeof (float) : sizeof (double);
+  size_t numClients = (size_t)numClients_;
+  struct Xt_offset_ext *partExts = Malloc(numClients * sizeof (partExts[0]));
+  struct partExtRes extRes = { .partExts = partExts, .needAExt = false };
+#ifdef USE_XT_REDIST_P2P_AEXT_NEW
+  struct Xt_aoffset_ext *partAExts = NULL;
+  for (size_t clientIdx = 0; clientIdx < numClients; ++clientIdx)
+    {
+      unsigned char *clientMem = clientBuf[clientIdx].mem;
+      size_t charOfs = (size_t)((clientMem
+                                 + (wHECast(clientMem))[headerIdx].offset)
+                                - clientBuf[0].mem);
+      int partSize = partSizes[clientIdx+1];
+      /* compare element counts instead of byte counts: the quotient
+       * plus a part size (at most INT_MAX) cannot wrap around */
+      extRes.needAExt |= (charOfs % elemSize != 0)
+        | (charOfs / elemSize + (size_t)partSize > (size_t)INT_MAX);
+    }
+  if (extRes.needAExt)
+    {
+      extRes.partExts = partAExts
+        = Realloc(partExts, numClients * sizeof (partAExts[0]));
+      partExts = NULL;
+    }
+#endif
+  for (size_t clientIdx = 0; clientIdx < numClients; ++clientIdx)
+    {
+      unsigned char *clientMem = clientBuf[clientIdx].mem;
+      /* every client must describe the same variable and conversion */
+      xassert((wHECast(clientMem))[headerIdx].specific.dataRecord.varID == varID
+              && (wHECast(clientMem))[headerIdx].id == conversion);
+      size_t charOfs = (size_t)((clientMem
+                                 + (wHECast(clientMem))[headerIdx].offset)
+                                - clientBuf[0].mem);
+      int partSize = partSizes[clientIdx+1];
+#ifndef USE_XT_REDIST_P2P_AEXT_NEW
+      xassert(charOfs % elemSize == 0
+              && charOfs / elemSize + (size_t)partSize <= INT_MAX);
+#else
+      if (extRes.needAExt)
+        {
+          partAExts[clientIdx].start = (MPI_Aint)charOfs;
+          partAExts[clientIdx].size = partSize;
+          partAExts[clientIdx].stride = (MPI_Aint)elemSize;
+        }
+      else
+#endif
+        {
+          int elemOfs = (int)(charOfs / elemSize);
+          partExts[clientIdx].start = elemOfs;
+          partExts[clientIdx].size = partSize;
+          partExts[clientIdx].stride = 1;
+        }
+    }
+  return extRes;
+}
+
+
 static Xt_redist
 buildVarRedist(int headerIdx, size_t streamIdx,
                /* index list representing the data elements gathered on
@@ -833,7 +897,6 @@ buildVarRedist(int headerIdx, size_t streamIdx,
   size_t elemSize
     = conversion == DATA_HEADER_FLOAT ? sizeof (float) : sizeof (double);
   size_t numClients = (size_t)numClients_;
-  struct Xt_offset_ext *partExts = Malloc(numClients * sizeof (partExts[0]));
   MPI_Comm pioInterComm = cdiPioInqInterComm(),
     collComm = commInqCommColl();
   Xt_xmap gatherXmap;
@@ -842,11 +905,13 @@ buildVarRedist(int headerIdx, size_t streamIdx,
         .size = xt_idxlist_get_num_indices(dstList),
         .stride = 1 };
   Xt_uid *restrict uids = NULL;
+  void *tmpBuf = NULL;
   int *restrict partSizes = NULL;
   bool cacheXmaps = conf->cacheXmaps;
   if (cacheXmaps)
     {
       allocUIDLookup(numClients, &uids, &partSizes);
+      tmpBuf = uids;
       uids[0] = xt_idxlist_get_uid(dstList);
       for (size_t clientIdx = 0; clientIdx < numClients; ++clientIdx)
         {
@@ -859,47 +924,48 @@ buildVarRedist(int headerIdx, size_t streamIdx,
         }
       if ((gatherXmap = cdiPioXmapCacheLookup(XmapCache, uids, partSizes)))
         {
-          for (size_t clientIdx = 0; clientIdx < numClients; ++clientIdx)
-            {
-              unsigned char *clientMem = clientBuf[clientIdx].mem;
-              struct dataRecord *dataHeader
-                = &(wHECast(clientMem))[headerIdx].specific.dataRecord;
-              xassert(dataHeader->varID == varID
-                      && (wHECast(clientMem))[headerIdx].id == conversion);
-              size_t charOfs = (size_t)((clientMem
-                                         + (wHECast(clientMem))[headerIdx].offset)
-                                        - clientBuf[0].mem);
-              int partSize = partSizes[clientIdx+1];
-              xassert(charOfs % elemSize == 0
-                      && charOfs / elemSize + (size_t)partSize <= INT_MAX);
-              int elemOfs = (int)(charOfs / elemSize);
-              partExts[clientIdx].start = elemOfs;
-              partExts[clientIdx].size = partSize;
-              partExts[clientIdx].stride = 1;
-            }
-          goto finishXmapCaching;
+          goto afterNewXmapCacheEntry;
         }
     }
-  gatherXmap = buildVarXmap(partExts, clientBuf, (size_t)headerIdx,
+  else
+    {
+      tmpBuf = partSizes = Malloc(sizeof (*partSizes) * (numClients + 1));
+    }
+  partSizes[0] = gatherExt.size;
+  gatherXmap = buildVarXmap(partSizes, clientBuf, (size_t)headerIdx,
                             dstList, partDescPreset, pioInterComm, collComm,
                             varID, conf);
   if (cacheXmaps)
     {
-      partSizes[0] = gatherExt.size;
-      for (size_t i = 0; i < numClients; ++i)
-        partSizes[i+1] = partExts[i].size;
       cdiPioXmapCacheAdd(XmapCache, uids, partSizes, gatherXmap);
-      finishXmapCaching:
-      Free(uids);
     }
+  afterNewXmapCacheEntry:;
+  struct partExtRes extRes
+    = fillPartExts(partSizes, clientBuf, (size_t)headerIdx);
+  Free(tmpBuf);
   MPI_Datatype elemDt
     = conversion ==  DATA_HEADER_FLOAT ? MPI_FLOAT : MPI_DOUBLE;
-  Xt_redist varRedist
-    = xt_redist_p2p_ext_new(gatherXmap, (int)numClients, partExts, 1,
-                            &gatherExt, elemDt);
+  Xt_redist varRedist;
+#if USE_XT_REDIST_P2P_AEXT_NEW
+  if (extRes.needAExt)
+    {
+      struct Xt_aoffset_ext gatherAExt
+        = { .start = 0,
+            .size = gatherExt.size,
+            .stride = (MPI_Aint)elemSize };
+
+      varRedist
+        = xt_redist_p2p_aext_new(gatherXmap, (int)numClients, extRes.partExts,
+                                 1, &gatherAExt, elemDt);
+    }
+  else
+#endif
+    varRedist
+      = xt_redist_p2p_ext_new(gatherXmap, (int)numClients, extRes.partExts,
+                              1, &gatherExt, elemDt);
   if (!cacheXmaps)
     xt_xmap_delete(gatherXmap);
-  Free(partExts);
+  Free(extRes.partExts);
   return varRedist;
 }
 
-- 
GitLab