Commit ede135a4 authored by Thomas Jahns's avatar Thomas Jahns 🤸
Browse files

Fix check of MPI_Abort exits.

* Some platforms do not use the POSIX exit code, but at least have the
  decency to give a textual report we can use in the test framework.
* Some totally fail to report this on their own. For those systems a
  work-around was implemented.
parent 577dd7f7
Pipeline #2469 passed with stages
in 37 seconds
......@@ -19,6 +19,8 @@ EXTRA_DIST = util/sunf95preproc-wrapper \
config/checkdoc/openmpi_dup.txt \
config/checksrc/pe5228_iv32584.c \
config/checkdoc/pe5228_iv32584.txt \
config/checkdoc/incorrect_mpi_abort_exitcode.c \
config/checkdoc/incorrect_mpi_abort_exitcode.txt \
contrib/00nagfor-libtool-patch/README.txt \
contrib/00nagfor-libtool-patch/nagfor-libtool.patch \
contrib/00nagfor-libtool-patch/nagfor-libtool-2.4.2.patch \
......
Your installation does not report the errorcode argument of MPI_Abort
to the calling environment as a POSIX exit code. Inspect config.log for
further details of the failure.
/**
* @file incorrect_mpi_abort_exitcode.c
* @brief demonstrates that some MPI installations do not pass the error code
* given to MPI_Abort on to the calling environment as exit code
*
* @copyright Copyright (C) 2019 Moritz Hanke <hanke@dkrz.de>
*
* @author Moritz Hanke <hanke@dkrz.de>
*/
/*
* Keywords:
* Maintainer: Moritz Hanke <hanke@dkrz.de>
* Thomas Jahns <jahns@dkrz.de>
* URL: https://doc.redmine.dkrz.de/yaxt/html/
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* Neither the name of the DKRZ GmbH nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
* IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
* OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
*
* This program calls MPI_Abort to test whether the error code
* argument is passed through to the calling process as exit code.
* acx_mpirun_expected_exitcode=2
*/
#include <stdlib.h>
#include <mpi.h>
/*
 * Initialize MPI and immediately abort the whole job with a fixed,
 * non-zero error code, so that the configure machinery can compare the
 * exit status of the launcher against the expected-exit-code annotation
 * in the comment above.
 */
int main(void)
{
  /* must agree with the expected exit code stated in the file comment */
  enum { abort_errorcode = 2 };

  MPI_Init(NULL, NULL);
  MPI_Abort(MPI_COMM_WORLD, abort_errorcode);

  /* Control is never supposed to get here; shut down cleanly in
   * case MPI_Abort unexpectedly returns. */
  MPI_Finalize();
  return EXIT_SUCCESS;
}
......@@ -270,6 +270,13 @@ ACX_MPIRUN(,[saved_MPI_LAUNCH=$MPI_LAUNCH],
[AC_MSG_FAILURE([unable to find a working MPI launch program, which is
required for checks for known MPI defects (see --without-regard-for-quality)])],
[saved_MPI_LAUNCH=$MPI_LAUNCH ; MPI_LAUNCH=true])])
dnl do not remove conftest.err we need to inspect the output
m4_pushdef([_AC_RUN_LOG_LIMIT],
m4_bpatsubst(m4_dquote(m4_defn([_AC_RUN_LOG_LIMIT])),
[rm -f conftest.er1 conftest.err],[rm -f conftest.er1]))dnl
m4_pushdef([_AC_LINK_IFELSE],
m4_bpatsubst(m4_dquote(m4_defn([_AC_LINK_IFELSE])),
[rm -f core conftest.err],[rm -f core]))dnl
ACX_MPI_DEFECTS(,,
[ACX_MPI_DEFECTS_DOCUMENT
AS_CASE(["$acx_subtestname"],[openmpi_datatype],
......@@ -277,6 +284,13 @@ ACX_MPI_DEFECTS(,,
ACX_OMPI_DT_WORKAROUND(,[openmpi_datatype.c],
[xt_mpi_workaround_LIBS=$ac_cv_search_opal_output
extrasub="${extrasub+$extrasub$as_nl}/^\# skip internal symbols for stubs created by xlf/i \\\\${as_nl}\# ignore opal symbols overriden by us\\\\${as_nl}/"'^\${exp_sym_prefix}opal_datatype_\\\\(commit\\\\|add\\\\)/b'])],
[incorrect_mpi_abort_exitcode],
[AC_MSG_WARN([Your installation drops the exit code of MPI_Abort!
Consider getting a better MPI/batch scheduler.])
AS_IF([grep -v -i 'mpi_abort(@<:@^,()@:>@*, 2)' conftest.err >/dev/null],
[AC_MSG_NOTICE([Implementing fragile work-around for tests!])
AC_DEFINE([XT_NEED_MPI_ABORT_WORK_AROUND],[1],
[To pass the value for MPI_Abort to the surrounding execution environment, a hack is needed.])])],
[AS_VAR_IF([with_regard_for_quality],[yes],
[AC_MSG_FAILURE([test for known defect $acx_subtestname failed,
re-configure with --without-regard-for-quality in case you must use the
......
......@@ -106,12 +106,21 @@ q
AS_IF([test `expr "$acx_mpirun_num_tasks" : "@<:@0-9@:>@@<:@0-9@:>@*$"` -gt 0 \
&& test "$acx_mpirun_num_tasks" -gt 0],,
[acx_mpirun_num_tasks=1])
acx_mpirun_expected_exitcode=`sed -n '/acx_mpirun_expected_exitcode *= *\(@<:@0-9@:>@*\)/{
s/.*acx_mpirun_expected_exitcode *= *\(@<:@0-9@:>@*\).*/\1/
p
q
}
' "acx_mpi_check_src_"`
AS_IF([test `expr "$acx_mpirun_expected_exitcode" : "@<:@0-9@:>@@<:@0-9@:>@*$"` -gt 0 \
&& test "$acx_mpirun_expected_exitcode" -gt 0],,
[acx_mpirun_expected_exitcode=0])
AC_LINK_IFELSE(,
[acx_mpirun_num_tasks="$MPI_LAUNCH -n $acx_mpirun_num_tasks ./conftest$EXEEXT"
AS_IF([expr "$ac_link" : '.*/libtool --mode=link' >/dev/null],
[acx_mpirun_test=`echo "$ac_link" | sed -e 's@\(.*/libtool --mode=\)link.*@\1@'`"execute $acx_mpirun_test"])
_AC_RUN_LOG_LIMIT([LIBC_FATAL_STDERR_=1 $acx_mpirun_num_tasks >&2],[echo "running $acx_mpirun_num_tasks"])
AS_IF([test $ac_status -eq 0],
AS_IF([test $ac_status -eq $acx_mpirun_expected_exitcode],
[acx_mpi_defects_result=okay; acx_mpi_defects_fail=no]m4_ifval([$2],[
$2]),
[acx_mpi_defects_result=error ; acx_mpi_defects_fail=yes])],
......
......@@ -49,7 +49,13 @@
#include <assert.h>
#include <string.h>
#ifdef XT_NEED_MPI_ABORT_WORK_AROUND
# include <fcntl.h>
# include <sys/stat.h>
# include <sys/types.h>
#endif
#include <unistd.h>
#include <mpi.h>
#include <yaxt.h>
......@@ -151,6 +157,13 @@ static void
xfail_abort(MPI_Comm comm, const char *msg, const char *source, int line)
{
  /*
   * Print a fatal-error diagnostic to stderr and terminate the MPI job
   * through MPI_Abort with error code 3.  Does not return: abort() is
   * called as a last resort should MPI_Abort come back.
   *
   * comm   - communicator handed to MPI_Abort
   * msg    - description of the error
   * source - name of the source file reporting the error
   * line   - line number reporting the error
   */
  fprintf(stderr, "Fatal error in %s, line %d: %s\n", source, line, msg);
#ifdef XT_NEED_MPI_ABORT_WORK_AROUND
  /*
   * The MPI installation was found not to hand the MPI_Abort error code
   * to the calling environment, so additionally record the intended exit
   * code in a result file.  Failures here are only reported, never
   * fatal: the process is about to abort anyway.
   */
  static const char result_path[] = "test_xmap_all2all_fail.result.txt";
  static const char exit_msg[] = "exited with code 3\n";
  /* plain text file, no need for execute permission bits */
  int fd = open(result_path, O_WRONLY | O_TRUNC | O_CREAT | O_NOCTTY, 0666);
  if (fd == -1) {
    perror(result_path);
  } else {
    /* do not write the terminating NUL byte of the string literal,
     * it would make the result file look like binary data to text tools */
    size_t msg_len = sizeof (exit_msg) - 1;
    if (write(fd, exit_msg, msg_len) != (ssize_t)msg_len)
      perror(result_path);
    if (close(fd) != 0)
      perror(result_path);
  }
#endif
  MPI_Abort(comm, 3);
  abort();
}
......
......@@ -6,15 +6,19 @@ export LIBC_FATAL_STDERR_
ulimit -c 0
for setup_size in small big; do
for suffix in '' '_f'; do
if @abs_top_builddir@/libtool --mode=execute \
if diags=`@abs_top_builddir@/libtool --mode=execute \
@MPI_LAUNCH@ -n 1 \
@abs_builddir@/test_xmap_all2all_fail$suffix -s $setup_size \
>/dev/null 2>&1; then
2>&1`; then
exit 1
else
rc=$?
if [ $rc -ne 3 ]; then
exit 1
if echo x"$diags" | grep -i 'mpi_abort([^,()]*, 3)' >/dev/null ; then
continue
else
exit 1
fi
fi
fi
done
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment