Skip to content
Snippets Groups Projects
Commit f91a8f01 authored by Uwe Schulzweida's avatar Uwe Schulzweida
Browse files

varray_*mv(): add pure simd version

parent 1d425bbd
No related branches found
No related tags found
1 merge request!147M214003/develop
Pipeline #48552 failed
...@@ -44,7 +44,7 @@ Field::resize(const size_t count) ...@@ -44,7 +44,7 @@ Field::resize(const size_t count)
} }
void void
Field::resize(const size_t count, const double value) Field::resize(const size_t count, double value)
{ {
memType = MemType::Double; memType = MemType::Double;
m_count = count; m_count = count;
...@@ -182,7 +182,7 @@ class valueDblIsEqual ...@@ -182,7 +182,7 @@ class valueDblIsEqual
public: public:
explicit valueDblIsEqual(double missval) : _missval(missval) {} explicit valueDblIsEqual(double missval) : _missval(missval) {}
bool bool
operator()(const double value) const operator()(double value) const
{ {
return dbl_is_equal(value, _missval); return dbl_is_equal(value, _missval);
} }
...@@ -196,7 +196,7 @@ class valueIsEqual ...@@ -196,7 +196,7 @@ class valueIsEqual
public: public:
explicit valueIsEqual(double missval) : _missval(missval) {} explicit valueIsEqual(double missval) : _missval(missval) {}
bool bool
operator()(const double value) const operator()(double value) const
{ {
return is_equal(value, _missval); return is_equal(value, _missval);
} }
...@@ -465,7 +465,7 @@ else ...@@ -465,7 +465,7 @@ else
template <typename T> template <typename T>
double double
array_pctl(size_t len, T *array, size_t nmiss, T missval, const double pn) array_pctl(size_t len, T *array, size_t nmiss, T missval, double pn)
{ {
double pctl = missval; double pctl = missval;
...@@ -492,7 +492,7 @@ array_pctl(size_t len, T *array, size_t nmiss, T missval, const double pn) ...@@ -492,7 +492,7 @@ array_pctl(size_t len, T *array, size_t nmiss, T missval, const double pn)
} }
double double
field_pctl(Field &field, const double pn) field_pctl(Field &field, double pn)
{ {
if (field.memType == MemType::Float) if (field.memType == MemType::Float)
return array_pctl(field.size, field.vec_f.data(), field.nmiss, (float) field.missval, pn); return array_pctl(field.size, field.vec_f.data(), field.nmiss, (float) field.missval, pn);
......
...@@ -84,12 +84,24 @@ varray_min_max_sum(size_t len, const Varray<T> &v, MinMaxSum mms) ...@@ -84,12 +84,24 @@ varray_min_max_sum(size_t len, const Varray<T> &v, MinMaxSum mms)
auto vmax = mms.max; auto vmax = mms.max;
auto vsum = mms.sum; auto vsum = mms.sum;
if (len > cdoMinLoopSize)
{
#ifndef __ICC // wrong result with icc19
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum);
}
else
{
#ifndef __ICC // wrong result with icc19 #ifndef __ICC // wrong result with icc19
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for simd if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum) #pragma omp simd reduction(min : vmin) reduction(max : vmax) reduction(+ : vsum)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum); for (size_t i = 0; i < len; ++i) f_minmaxsum((double) v[i], vmin, vmax, vsum);
}
return MinMaxSum(vmin, vmax, vsum, len); return MinMaxSum(vmin, vmax, vsum, len);
} }
...@@ -463,21 +475,45 @@ varray_min_mv(size_t len, const Varray<T> &v, T missval) ...@@ -463,21 +475,45 @@ varray_min_mv(size_t len, const Varray<T> &v, T missval)
if (std::isnan(missval)) if (std::isnan(missval))
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) #pragma omp simd reduction(min : vmin)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal); for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, dbl_is_equal);
}
} }
else else
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) #pragma omp simd reduction(min : vmin)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal); for (size_t i = 0; i < len; ++i) f_min_mv(v[i], missval, vmin, is_equal);
}
} }
if (is_equal(vmin, std::numeric_limits<T>::max())) vmin = missval; if (is_equal(vmin, std::numeric_limits<T>::max())) vmin = missval;
...@@ -505,21 +541,45 @@ varray_max_mv(size_t len, const Varray<T> &v, T missval) ...@@ -505,21 +541,45 @@ varray_max_mv(size_t len, const Varray<T> &v, T missval)
if (std::isnan(missval)) if (std::isnan(missval))
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(max : vmax)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(max : vmax) #pragma omp simd reduction(max : vmax)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal); for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, dbl_is_equal);
}
} }
else else
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(max : vmax)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(max : vmax) #pragma omp simd reduction(max : vmax)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal); for (size_t i = 0; i < len; ++i) f_max_mv(v[i], missval, vmax, is_equal);
}
} }
if (is_equal(vmax, -std::numeric_limits<T>::max())) vmax = missval; if (is_equal(vmax, -std::numeric_limits<T>::max())) vmax = missval;
...@@ -552,21 +612,45 @@ varray_range_mv(size_t len, const Varray<T> &v, T missval) ...@@ -552,21 +612,45 @@ varray_range_mv(size_t len, const Varray<T> &v, T missval)
if (std::isnan(missval)) if (std::isnan(missval))
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) #pragma omp simd reduction(min : vmin) reduction(max : vmax)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal); for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, dbl_is_equal);
}
} }
else else
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(min : vmin) reduction(max : vmax) #pragma omp simd reduction(min : vmin) reduction(max : vmax)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal); for (size_t i = 0; i < len; ++i) f_minmax_mv(v[i], missval, vmin, vmax, is_equal);
}
} }
return (is_equal(vmin, std::numeric_limits<T>::max()) && is_equal(vmax, -std::numeric_limits<T>::max())) ? missval : vmax - vmin; return (is_equal(vmin, std::numeric_limits<T>::max()) && is_equal(vmax, -std::numeric_limits<T>::max())) ? missval : vmax - vmin;
...@@ -637,21 +721,45 @@ varray_sum_mv(size_t len, const Varray<T> &v, T missval) ...@@ -637,21 +721,45 @@ varray_sum_mv(size_t len, const Varray<T> &v, T missval)
if (std::isnan(missval)) if (std::isnan(missval))
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, nvals)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, nvals) #pragma omp simd reduction(+ : sum, nvals)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal); for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, dbl_is_equal);
}
} }
else else
{ {
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, nvals)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, nvals) #pragma omp simd reduction(+ : sum, nvals)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal); for (size_t i = 0; i < len; ++i) f_sum_mv(v[i], missval, sum, nvals, is_equal);
}
} }
if (!nvals) sum = missval; if (!nvals) sum = missval;
...@@ -771,23 +879,47 @@ varray_weighted_mean_mv(size_t len, const Varray<T> &v, const Varray<double> &w, ...@@ -771,23 +879,47 @@ varray_weighted_mean_mv(size_t len, const Varray<T> &v, const Varray<double> &w,
if (std::isnan(missval)) if (std::isnan(missval))
{ {
auto is_EQ = dbl_is_equal; auto is_EQ = dbl_is_equal;
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for default(shared) schedule(static) reduction(+ : sum, sumw)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, sumw) #pragma omp simd reduction(+ : sum, sumw)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ);
}
wmean = DIVM(sum, sumw); wmean = DIVM(sum, sumw);
} }
else else
{ {
auto is_EQ = is_equal; auto is_EQ = is_equal;
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : sum, sumw) #pragma omp parallel for simd default(shared) schedule(static) reduction(+ : sum, sumw)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp simd reduction(+ : sum, sumw)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ); for (size_t i = 0; i < len; ++i) f_weighted_mean_mv(w[i], v[i], missval1, sum, sumw, is_EQ);
}
wmean = DIVM(sum, sumw); wmean = DIVM(sum, sumw);
} }
...@@ -886,12 +1018,24 @@ varray_prevarsum0_mv(size_t len, const Varray<T> &v, double missval, double &rsu ...@@ -886,12 +1018,24 @@ varray_prevarsum0_mv(size_t len, const Varray<T> &v, double missval, double &rsu
rsum = rsumw = 0.0; rsum = rsumw = 0.0;
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum, rsumw)
#endif
#endif
for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : rsum, rsumw) #pragma omp simd reduction(+ : rsum, rsumw)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw); for (size_t i = 0; i < len; ++i) f_prevarsum0_mv(v[i], missval, rsum, rsumw);
}
} }
template <typename T> template <typename T>
...@@ -1044,12 +1188,24 @@ varray_weighted_prevarsum_mv(size_t len, const Varray<T> &v, const Varray<double ...@@ -1044,12 +1188,24 @@ varray_weighted_prevarsum_mv(size_t len, const Varray<T> &v, const Varray<double
rsum = rsumq = rsumw = rsumwq = 0.0; rsum = rsumq = rsumw = rsumwq = 0.0;
if (len > cdoMinLoopSize)
{
#ifndef __ICC // internal error with icc22: lambda not supported #ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4 #ifdef HAVE_OPENMP4
#pragma omp parallel for if (len > cdoMinLoopSize) default(shared) schedule(static) reduction(+ : rsum, rsumq, rsumw, rsumwq) #pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum, rsumq, rsumw, rsumwq)
#endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq);
}
else
{
#ifndef __ICC // internal error with icc22: lambda not supported
#ifdef HAVE_OPENMP4
#pragma omp simd reduction(+ : rsum, rsumq, rsumw, rsumwq)
#endif #endif
for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq); #endif
for (size_t i = 0; i < len; ++i) f_weighted_prevarsum_mv(w[i], (double) v[i], missval, rsum, rsumq, rsumw, rsumwq);
}
} }
template <typename T> template <typename T>
...@@ -1129,7 +1285,7 @@ template <typename T> ...@@ -1129,7 +1285,7 @@ template <typename T>
static void static void
varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, double &rsum3w, double &rsum2diff, double &rsum4diff) varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, double &rsum3w, double &rsum2diff, double &rsum4diff)
{ {
auto f_preskewsum_mv = [](auto a, auto mv_a, auto meanval, auto &sum2diff, auto &sum4diff, auto &sum3w) { auto f_prekurtsum_mv = [](auto a, auto mv_a, auto meanval, auto &sum2diff, auto &sum4diff, auto &sum3w) {
if (!dbl_is_equal(a, mv_a)) if (!dbl_is_equal(a, mv_a))
{ {
double vdiff = a - meanval; double vdiff = a - meanval;
...@@ -1148,7 +1304,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou ...@@ -1148,7 +1304,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou
#pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum2diff, rsum4diff, rsum3w) #pragma omp parallel for simd default(shared) schedule(static) reduction(+ : rsum2diff, rsum4diff, rsum3w)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_preskewsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); for (size_t i = 0; i < len; ++i) f_prekurtsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w);
} }
else else
{ {
...@@ -1157,7 +1313,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou ...@@ -1157,7 +1313,7 @@ varray_prekurtsum_mv(size_t len, const Varray<T> &v, T missval, double mean, dou
#pragma omp simd reduction(+ : rsum2diff, rsum4diff, rsum3w) #pragma omp simd reduction(+ : rsum2diff, rsum4diff, rsum3w)
#endif #endif
#endif #endif
for (size_t i = 0; i < len; ++i) f_preskewsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w); for (size_t i = 0; i < len; ++i) f_prekurtsum_mv(v[i], missval, mean, rsum2diff, rsum4diff, rsum3w);
} }
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment