Commit 204e735a authored by Uwe Schulzweida's avatar Uwe Schulzweida
Browse files

cgribexlib update

parent 5180bb0d
......@@ -4,7 +4,7 @@ CDI NEWS
Version 1.5.5 (?? May 2012):
New features:
* added single precision support: streamWriteVarFloat, streamWriteVarSliceFloat
* Added single precision support: streamWriteVarFloat, streamWriteVarSliceFloat
Version 1.5.4 (30 January 2012):
......
/* Automatically generated by m214003 at 2012-02-27, do not edit */
/* Automatically generated by m214003 at 2012-04-02, do not edit */
/* CGRIBEXLIB_VERSION="1.5.2" */
......@@ -12,13 +12,35 @@
#include <stdarg.h>
#include <sys/types.h>
#undef _GET_X86_COUNTER
#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
//#undef __AVX__
//#undef __SSE2__
#ifdef _GET_IBM_COUNTER
#include <libhpc.h>
#endif
#ifdef __AVX__
#include <float.h>
#include <stdint.h>
#include <inttypes.h>
#include <immintrin.h>
#ifdef _GET_X86_COUNTER
#include <x86intrin.h>
#endif
#else
#ifdef __SSE2__
#include <float.h>
#include <pmmintrin.h>
#include <stdint.h>
#include <inttypes.h>
#include <pmmintrin.h>
#ifdef _GET_X86_COUNTER
#include <x86intrin.h>
#endif
#endif
#endif
#include "file.h"
#include "dmemory.h"
......@@ -2804,12 +2826,116 @@ void gribPrintSec4Wave(int *isec4)
}
}
#undef _GET_X86_COUNTER
#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
//#undef __AVX__
//#undef __SSE2__
#ifdef _GET_IBM_COUNTER
#endif
#ifdef __AVX__
#ifdef _GET_X86_COUNTER
#endif
#else
#ifdef __SSE2__
#ifdef _GET_X86_COUNTER
#endif
#endif
#endif
#ifdef __AVX__
static
void avx_minmaxval(const double *restrict buf, long nframes, double *min, double *max)
{
__m256d current_max, current_min, work;
// load max and min values into all four slots of the XMM registers
current_min = _mm256_set1_pd(*min);
current_max = _mm256_set1_pd(*max);
// Work input until "buf" reaches 32 byte alignment
while ( ((unsigned long)buf) % 32 != 0 && nframes > 0) {
// Load the next double into the work buffer
work = _mm256_set1_pd(*buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf++;
nframes--;
}
while (nframes >= 16) {
// use 64 byte prefetch?
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
nframes -= 16;
}
// work through aligned buffers
while (nframes >= 4) {
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
nframes -= 4;
}
// work through the remainung values
while ( nframes > 0) {
work = _mm256_set1_pd(*buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf++;
nframes--;
}
// find min & max value through shuffle tricks
work = current_min;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = _mm256_min_pd (work, current_min);
current_min = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_min_pd (work, current_min);
_mm256_store_pd(min, work);
work = current_max;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = _mm256_max_pd (work, current_max);
current_max = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_max_pd (work, current_max);
_mm256_store_pd(max, work);
return;
}
#else
#ifdef __SSE2__
static
void sse_minmaxval(const double *restrict buf, long nframes, double *min, double *max)
void sse2_minmaxval(const double *restrict buf, long nframes, double *min, double *max)
{
__m128d current_max, current_min, work;
......@@ -2884,19 +3010,89 @@ void sse_minmaxval(const double *restrict buf, long nframes, double *min, double
}
#endif
#endif
static
void minmaxval(const double *restrict data, long datasize, double *fmin, double *fmax)
{
long i;
#ifdef _GET_X86_COUNTER
uint64_t start_minmax, end_minmax;
#endif
if ( datasize < 1 ) return;
#ifdef _GET_X86_COUNTER
start_minmax = _rdtsc();
#endif
#ifdef __AVX__
avx_minmaxval(data, datasize, fmin, fmax);
#else
#ifdef __SSE2__
sse_minmaxval(data, datasize, fmin, fmax);
sse2_minmaxval(data, datasize, fmin, fmax);
#else
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_1 7
// to allow pipelining we have to unroll
#ifdef _GET_IBM_COUNTER
hpmStart(1, "minmax fsel");
#endif
{
long i, j;
long residual = datasize % __UNROLL_DEPTH_1;
long ofs = datasize - residual;
double register dmin[__UNROLL_DEPTH_1];
double register dmax[__UNROLL_DEPTH_1];
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
{
dmin[j] = data[0];
dmax[j] = data[0];
}
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_1 )
{
for (j = 0; j < __UNROLL_DEPTH_1; j++)
{
dmin[j] = __fsel(dmin[j] - data[i+j], data[i+j], dmin[j]);
dmax[j] = __fsel(data[i+j] - dmax[j], data[i+j], dmax[j]);
}
}
for (j = 0; j < residual; j++)
{
dmin[j] = __fsel(dmin[j] - data[ofs+j], data[ofs+j], dmin[j]);
dmax[j] = __fsel(data[ofs+j] - dmax[j], data[ofs+j], dmax[j]);
}
for ( j = 0; j < __UNROLL_DEPTH_1; j++)
{
*fmin = __fsel(*fmin - dmin[j], dmin[j], *fmin);
*fmax = __fsel(dmax[j] - *fmax, dmax[j], *fmax);
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(1);
#endif
#undef __UNROLL_DEPTH_1
#else
#ifdef _GET_IBM_COUNTER
hpmStart(1, "minmax base");
#endif
{
long i;
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
......@@ -2904,18 +3100,42 @@ void minmaxval(const double *restrict data, long datasize, double *fmin, double
#elif defined (__uxp__)
#pragma loop novrec
#endif
for ( i = 0; i < datasize; ++i )
{
if ( *fmin > data[i] ) *fmin = data[i];
if ( *fmax < data[i] ) *fmax = data[i];
/*
*fmin = *fmin < data[i] ? *fmin : data[i];
*fmax = *fmax > data[i] ? *fmax : data[i];
*/
}
for ( i = 0; i < datasize; ++i )
{
if ( *fmin > data[i] ) *fmin = data[i];
if ( *fmax < data[i] ) *fmax = data[i];
/*
*fmin = *fmin < data[i] ? *fmin : data[i];
*fmax = *fmax > data[i] ? *fmax : data[i];
*/
}
}
#ifdef _GET_IBM_COUNTER
hpmStop(1);
#endif
#endif
#endif
#endif
#ifdef _GET_X86_COUNTER
end_minmax = _rdtsc();
#ifdef __AVX__
printf("AVX cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#else
#ifdef __SSE2__
printf("SSE2 cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#else
printf("loop cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
#endif
#endif
#endif
}
return;
}
int BitsPerInt = (int) (sizeof(int) * 8);
......@@ -3513,11 +3733,14 @@ void encode_double_array_common(int numBits, long packStart, long datasize, GRIB
}
static
void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *lGrib,
const double *data, double zref, double factor, long *gz)
void encode_double_array(int numBits, long PackStart, long datasize,
GRIBPACK *restrict lGrib,
const double *restrict data,
double zref, double factor, long *restrict gz)
{
long i, z = *gz;
unsigned int ipval;
unsigned long ipval;
double tmp;
if ( numBits == 8 )
{
......@@ -3530,7 +3753,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for ( i = PackStart; i < datasize; i++ )
{
ipval = (unsigned int) ((data[i] - zref) * factor + 0.5);
tmp = ((data[i] - zref) * factor + 0.5);
ipval = (unsigned long) tmp;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
......@@ -3547,18 +3771,81 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_2 8
{
long j;
long residual = datasize % __UNROLL_DEPTH_2;
long ofs = datasize - residual;
double dval[__UNROLL_DEPTH_2];
unsigned long ival[__UNROLL_DEPTH_2];
// reducing FP operations to single FMA is slowing down ...
// double dconst;
// (data - zref)*factor+0.5
// = factor*data - zref*factor + 0.5
// dconst = zref*factor-0.5;
#ifdef _GET_IBM_COUNTER
hpmStart(2, "pack 16bit unrolled");
#endif
#pragma nounrollandfuse
for ( i = 0; i < datasize - residual; i += __UNROLL_DEPTH_2 )
{
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
dval[j] = ((data[i+j] - zref) * factor + 0.5);
//dval[j] = factor*data[i+j] - dconst;
}
s for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
ival[j] = (unsigned long) dval[j];
// if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
// if ( ipval < 0 ) ipval = 0;
}
for (j = 0; j < __UNROLL_DEPTH_2; j++)
{
lGrib[z ] = ival[j] >> 8;
lGrib[z+1] = ival[j];
z += 2;
}
}
for (j = 0; j < residual; j++)
{
dval[j] = ((data[ofs+j] - zref) * factor + 0.5);
//dval[j] = factor*data[i+j] - dconst;
ival[j] = (unsigned long) dval[j];
lGrib[z ] = ival[j] >> 8;
lGrib[z+1] = ival[j];
z += 2;
}
#ifdef _GET_IBM_COUNTER
hpmStop(2);
#endif
#undef __UNROLL_DEPTH_2
}
#else
#ifdef _GET_IBM_COUNTER
hpmStart(2, "pack 16bit base");
#endif
for ( i = PackStart; i < datasize; i++ )
{
ipval = (unsigned int) ((data[i] - zref) * factor + 0.5);
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
*/
lGrib[z ] = ipval >> 8;
{
tmp = ((data[i] - zref) * factor + 0.5);
ipval = (unsigned long) tmp;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
*/
lGrib[z ] = ipval >> 8;
lGrib[z+1] = ipval;
z += 2;
}
z += 2;
}
#ifdef _GET_IBM_COUNTER
hpmStop(2);
#endif
#endif
}
else if ( numBits == 24 )
{
......@@ -3571,7 +3858,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for ( i = PackStart; i < datasize; i++ )
{
ipval = (unsigned int) ((data[i] - zref) * factor + 0.5);
tmp = ((data[i] - zref) * factor + 0.5);
ipval = (unsigned long) tmp;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
......@@ -3593,7 +3881,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for ( i = PackStart; i < datasize; i++ )
{
ipval = (unsigned int) ((data[i] - zref) * factor + 0.5);
tmp = ((data[i] - zref) * factor + 0.5);
ipval = (unsigned long) tmp;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
......@@ -9270,7 +9559,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return (gribLen);
}
static const char grb_libvers[] = "1.5.2" " of ""Feb 27 2012"" ""13:52:12";
static const char grb_libvers[] = "1.5.2" " of ""Apr 2 2012"" ""10:03:59";
const char *
cgribexLibraryVersion(void)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment