From f91a8f0197f2807831c7cc8af7bf76c2041c310f Mon Sep 17 00:00:00 2001 From: Uwe Schulzweida <uwe.schulzweida@mpimet.mpg.de> Date: Sun, 5 Nov 2023 11:35:20 +0100 Subject: [PATCH] varray_*mv(): add pure simd version --- src/field.cc | 10 +-- src/varray.cc | 214 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 190 insertions(+), 34 deletions(-) diff --git a/src/field.cc b/src/field.cc index 31dbe93a9..988431e2f 100644 --- a/src/field.cc +++ b/src/field.cc @@ -44,7 +44,7 @@ Field::resize(const size_t count) } void -Field::resize(const size_t count, const double value) +Field::resize(const size_t count, double value) { memType = MemType::Double; m_count = count; @@ -182,7 +182,7 @@ class valueDblIsEqual public: explicit valueDblIsEqual(double missval) : _missval(missval) {} bool - operator()(const double value) const + operator()(double value) const { return dbl_is_equal(value, _missval); } @@ -196,7 +196,7 @@ class valueIsEqual public: explicit valueIsEqual(double missval) : _missval(missval) {} bool - operator()(const double value) const + operator()(double value) const { return is_equal(value, _missval); } @@ -465,7 +465,7 @@ else template <typename T> double -array_pctl(size_t len, T *array, size_t nmiss, T missval, const double pn) +array_pctl(size_t len, T *array, size_t nmiss, T missval, double pn) { double pctl = missval; @@ -492,7 +492,7 @@ array_pctl(size_t len, T *array, size_t nmiss, T missval, const double pn) } double -field_pctl(Field &field, const double pn) +field_pctl(Field &field, double pn) { if (field.memType == MemType::Float) return array_pctl(field.size, field.vec_f.data(), field.nmiss, (float) field.missval, pn); diff --git a/src/varray.cc b/src/varray.cc index d8eab4c5f..3d718fb83 100644 --- a/src/varray.cc +++ b/src/varray.cc @@ -84,12 +84,24 @@ varray_min_max_sum(size_t len, const Varray<T> &v, MinMaxSum mms) auto vmax = mms.max; auto vsum = mms.sum; + if (len > cdoMinLoopSize) + { +#ifndef __ICC // wrong result with icc19 +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum); + } + else + { #ifndef __ICC // wrong result with icc19 #ifdef HAVE_OPENMP4 -#pragma omp parallel for simd if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum) +#pragma omp simd reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum) #endif #endif - for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum); + for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum); + } return MinMaxSum(vmin, vmax, vsum, len); } @@ -463,21 +475,45 @@ varray_min_mv(size_t len, const Varray<T> &v, T missval) if (std::isnan(missval)) { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) +#pragma omp simd reduction(min : vmin) #endif #endif - for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal); + for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal); + } } else { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) +#pragma omp simd reduction(min : vmin) #endif #endif - for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal); + for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal); + } } if (is_equal(vmin, std::numeric_limits<T>::max())) vmin = missval; @@ -505,21 +541,45 @@ varray_max_mv(size_t len, const Varray<T> &v, T missval) if (std::isnan(missval)) { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(max : vmax) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(max : vmax) +#pragma omp simd reduction(max : vmax) #endif #endif - for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal); + for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal); + } } else { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(max : vmax) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(max : vmax) +#pragma omp simd reduction(max : vmax) #endif #endif - for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal); + for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal); + } } if (is_equal(vmax, -std::numeric_limits<T>::max())) vmax = missval; @@ -552,21 +612,45 @@ varray_range_mv(size_t len, const Varray<T> &v, T missval) if (std::isnan(missval)) { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) +#pragma omp simd reduction(min : vmin) reduction(max : vmax) #endif #endif - for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal); + for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal); + } } else { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) +#pragma omp simd reduction(min : vmin) reduction(max : vmax) #endif #endif - for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal); + for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal); + } } return (is_equal(vmin, std::numeric_limits<T>::max()) && is_equal(vmax, -std::numeric_limits<T>::max())) ? missval : vmax - vmin; @@ -637,21 +721,45 @@ varray_sum_mv(size_t len, const Varray<T> &v, T missval) if (std::isnan(missval)) { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, nvals) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, nvals) +#pragma omp simd reduction(+ : sum, nvals) #endif #endif - for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal); + for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal); + } } else { + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, nvals) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, nvals) +#pragma omp simd reduction(+ : sum, nvals) #endif #endif - for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal); + for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal); + } } if (!nvals) sum = missval; @@ -771,23 +879,47 @@ varray_weighted_mean_mv(size_t len, const Varray<T> &v, const Varray<double> &w, if (std::isnan(missval)) { auto is_EQ = dbl_is_equal; + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for default(shared) schedule(static) reduction(+ : sum, sumw) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, sumw) +#pragma omp simd reduction(+ : sum, sumw) #endif #endif - for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + } wmean = DIVM(sum, sumw); } else { auto is_EQ = is_equal; + if (len > cdoMinLoopSize) + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, sumw) +#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, sumw) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + } + else + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp simd reduction(+ : sum, sumw) #endif #endif - for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); + } wmean = DIVM(sum, sumw); } @@ -886,12 +1018,24 @@ varray_prevarsum0_mv(size_t len, const Varray<T> &v, double missval, double &rsu rsum = rsumw = 0.0; + if (len > cdoMinLoopSize) + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum, rsumw) +#endif +#endif + for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw); + } + else + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : rsum, rsumw) +#pragma omp simd reduction(+ : rsum, rsumw) #endif #endif - for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw); + for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw); + } } template <typename T> @@ -1044,12 +1188,24 @@ varray_weighted_prevarsum_mv(size_t len, const Varray<T> &v, const Varray<double rsum = rsumq = rsumw = rsumwq = 0.0; + if (len > cdoMinLoopSize) + { #ifndef __ICC // internal error with icc22: lambda not supported #ifdef HAVE_OPENMP4 -#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : rsum, rsumq, rsumw, rsumwq) +#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum, rsumq, rsumw, rsumwq) +#endif #endif + for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq); + } + else + { +#ifndef __ICC // internal error with icc22: lambda not supported +#ifdef HAVE_OPENMP4 +#pragma omp simd reduction(+ : rsum, rsumq, rsumw, rsumwq) #endif - for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq); +#endif + for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq); + } } template <typename T> @@ -1129,7 +1285,7 @@ template <typename T> static void varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, double &rsum3w, double &rsum2diff, double &rsum4diff) { - auto f_preskewsum_mv = [](auto a, auto mv_a, auto meanval, auto &sum2diff, auto &sum4diff, auto &sum3w) { + auto f_prekurtsum_mv = [](auto a, auto mv_a, auto meanval, auto &sum2diff, auto &sum4diff, auto &sum3w) { if (!dbl_is_equal(a, mv_a)) { double vdiff = a - meanval; @@ -1148,7 +1304,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou #pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum2diff, rsum4diff, rsum3w) #endif #endif - for (size_t i = 0; i < len; ++i) f_preskewsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); + for (size_t i = 0; i < len; ++i) f_prekurtsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); } else { @@ -1157,7 +1313,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou #pragma omp simd reduction(+ : rsum2diff, rsum4diff, rsum3w) #endif #endif - for (size_t i = 0; i < len; ++i) f_preskewsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); + for (size_t i = 0; i < len; ++i) f_prekurtsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); } } -- GitLab