Commit 948280e6 authored by Uwe Schulzweida's avatar Uwe Schulzweida
Browse files

cgribexlib update

parent 90d917da
/* Automatically generated by m214003 at 2012-12-04, do not edit */
/* Automatically generated by m214003 at 2012-12-07, do not edit */
/* CGRIBEXLIB_VERSION="1.5.6" */
......@@ -672,9 +672,9 @@ double intpow2(int x)
else
return (pow(2.0, (double) x));
}
//#define _GET_X86_COUNTER
//#undef _GET_X86_COUNTER
//#undef _GET_IBM_COUNTER
//#define _GET_MACH_COUNTER
//#undef _GET_MACH_COUNTER
//#undef _ARCH_PWR6
#if defined(__GNUC__) && (__GNUC__ >= 4)
......@@ -684,16 +684,12 @@ double intpow2(int x)
#define DISABLE_SIMD
#endif
#ifdef _GET_IBM_COUNTER
#endif
#ifdef _GET_X86_COUNTER
#endif
#ifdef _GET_MACH_COUNTER
#if defined _GET_IBM_COUNTER
#elif defined _GET_X86_COUNTER
#elif defined _GET_MACH_COUNTER
#endif
#if defined __AVX__
#elif defined __SSE2__
#endif
//#define DISABLE_SIMD
#ifdef DISABLE_SIMD
#ifndef ENABLE_AVX
......@@ -705,12 +701,18 @@ double intpow2(int x)
#endif
#if defined __AVX__
#elif defined __SSE2__
#endif
#if defined __AVX__
static
void avx_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
void avx_minmax_val(const double *restrict buf, size_t nframes, double *min, double *max)
{
double fmin[4], fmax[4];
__m256d current_max, current_min, work;
// load max and min values into all four slots of the XMM registers
// load max and min values into all four slots of the YMM registers
current_min = _mm256_set1_pd(*min);
current_max = _mm256_set1_pd(*max);
......@@ -726,25 +728,27 @@ void avx_minmax_val(const double *restrict buf, long nframes, double *min, doubl
}
while (nframes >= 16) {
// use 64 byte prefetch?
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_loadu_pd(buf);
(void) _mm_prefetch(buf+8, _MM_HINT_NTA);
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_loadu_pd(buf);
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_loadu_pd(buf);
(void) _mm_prefetch(buf+8, _MM_HINT_NTA);
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2+
work = _mm256_loadu_pd(buf);
work = _mm256_load_pd(buf);
current_min = _mm256_min_pd(current_min, work);
current_max = _mm256_max_pd(current_max, work);
buf += 4;
......@@ -772,19 +776,24 @@ void avx_minmax_val(const double *restrict buf, long nframes, double *min, doubl
// find min & max value through shuffle tricks
work = current_min;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = _mm256_shuffle_pd(work, work, 5);
work = _mm256_min_pd (work, current_min);
current_min = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_permute2f128_pd(work, work, 1);
work = _mm256_min_pd (work, current_min);
_mm256_storeu_pd(min, work);
_mm256_storeu_pd(fmin, work);
work = current_max;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(2, 3, 0, 1));
work = current_max;
work = _mm256_shuffle_pd(work, work, 5);
work = _mm256_max_pd (work, current_max);
current_max = work;
work = _mm256_shuffle_pd(work, work, _MM_SHUFFLE(1, 0, 3, 2));
work = _mm256_permute2f128_pd(work, work, 1);
work = _mm256_max_pd (work, current_max);
_mm256_storeu_pd(max, work);
_mm256_storeu_pd(fmax, work);
*min = fmin[0];
*max = fmax[0];
return;
}
......@@ -792,7 +801,7 @@ void avx_minmax_val(const double *restrict buf, long nframes, double *min, doubl
#elif defined __SSE2__
static
void sse2_minmax_val(const double *restrict buf, long nframes, double *min, double *max)
void sse2_minmax_val(const double *restrict buf, size_t nframes, double *min, double *max)
{
__m128d current_max, current_min, work;
......@@ -813,7 +822,7 @@ void sse2_minmax_val(const double *restrict buf, long nframes, double *min, doub
while (nframes >= 8) {
// use 64 byte prefetch for double octetts
__builtin_prefetch(buf+64,0,0); // for GCC 4.3.2 +
// __builtin_prefetch(buf+64,0,0); // for GCC 4.3.2 +
work = _mm_load_pd(buf);
current_min = _mm_min_pd(current_min, work);
......@@ -869,7 +878,7 @@ void sse2_minmax_val(const double *restrict buf, long nframes, double *min, doub
#endif
static
void minmax_val(const double *restrict data, long datasize, double *fmin, double *fmax)
void minmax_val(const double *restrict data, size_t datasize, double *fmin, double *fmax)
{
#if defined _GET_X86_COUNTER || defined _GET_MACH_COUNTER
uint64_t start_minmax, end_minmax;
......@@ -903,9 +912,9 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
hpmStart(1, "minmax fsel");
#endif
{
long i, j;
long residual = datasize % __UNROLL_DEPTH_1;
long ofs = datasize - residual;
size_t i, j;
size_t residual = datasize % __UNROLL_DEPTH_1;
size_t ofs = datasize - residual;
double register dmin[__UNROLL_DEPTH_1];
double register dmax[__UNROLL_DEPTH_1];
......@@ -948,7 +957,7 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
hpmStart(1, "minmax base");
#endif
{
long i;
size_t i;
#if defined (CRAY)
#pragma _CRI ivdep
......@@ -984,21 +993,25 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#if defined __AVX__
printf("AVX minmax cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
fprintf (stderr, "AVX min: %lf max: %lf\n", *fmin, *fmax);
#elif defined __SSE2__
printf("SSE2 minmax cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
fprintf (stderr, "SSE2 min: %lf max: %lf\n", *fmin, *fmax);
#else
printf("loop minmax cycles:: %" PRIu64 "\n",
end_minmax-start_minmax);
fprintf (stderr, "loop min: %lf max: %lf\n", *fmin, *fmax);
#endif
#endif
return;
}
//#define _GET_X86_COUNTER
//#define _GET_MACH_COUNTER
//#undef _GET_X86_COUNTER
//#undef _GET_MACH_COUNTER
//#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
#if defined(__GNUC__) && (__GNUC__ >= 4)
#elif defined(__ICC) && (__ICC >= 1100)
......@@ -1012,8 +1025,7 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#elif defined _GET_MACH_COUNTER
#endif
#define DISABLE_SIMD
//#define ENABLE_SSE4_1
//#define DISABLE_SIMD
#ifdef DISABLE_SIMD
#ifndef ENABLE_AVX
......@@ -1028,19 +1040,17 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#elif defined __SSE4_1__
#endif
#if defined __AVX__
static
void avx_encode_double_array_2byte(long datasize,
void avx_encode_double_array_2byte(size_t datasize,
unsigned char * restrict lGrib,
const double * restrict data,
double zref, double factor, long *gz)
double zref, double factor, size_t *gz)
{
long i, j;
unsigned int ival;
size_t i, j;
const double *dval = data;
__m128i *sgrib = (__m128i *) lGrib;
__m128i *sgrib = (__m128i *) (lGrib+(*gz));
const __m128i swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
......@@ -1114,14 +1124,17 @@ void avx_encode_double_array_2byte(long datasize,
}
if (i != datasize)
i -= 16;
for ( j = i; j < datasize; j++ )
{
ival = (unsigned int) ((dval[j] - zref) * factor + 0.5);
lGrib[2*j ] = ival >> 8;
lGrib[2*j+1] = ival;
}
{
uint16_t ui16;
i -= 16;
for ( j = i; j < datasize; j++ )
{
ui16 = (uint16_t) ((data[j] - zref) * factor + 0.5);
lGrib[*gz+2*j ] = ui16 >> 8;
lGrib[*gz+2*j+1] = ui16;
}
}
*gz += 2*datasize;
return;
......@@ -1130,15 +1143,14 @@ void avx_encode_double_array_2byte(long datasize,
#elif defined __SSE4_1__
static
void sse41_encode_double_array_2byte(long datasize,
void sse41_encode_double_array_2byte(size_t datasize,
unsigned char * restrict lGrib,
const double * restrict data,
double zref, double factor, long *gz)
double zref, double factor, size_t *gz)
{
long i, j;
unsigned int ival;
size_t i, j;
const double *dval = data;
__m128i *sgrib = (__m128i *) lGrib;
__m128i *sgrib = (__m128i *) (lGrib+(*gz));
const __m128i swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
......@@ -1150,8 +1162,6 @@ void sse41_encode_double_array_2byte(long datasize,
__m128i i0, i1, i2, i3, i4;
__m128i s0, s1;
// fprintf (stderr,"z entry: %ld ", *gz);
for (i = 0; i < datasize; i += 16)
{
(void) _mm_prefetch(dval+8, _MM_HINT_NTA);
......@@ -1170,7 +1180,7 @@ void sse41_encode_double_array_2byte(long datasize,
i0 = _mm_cvttpd_epi32 (d0);
i4 = _mm_cvttpd_epi32 (d4);
i0 = _mm_unpacklo_epi64 (i0, i4);
//_____________________________________________________________________________
d1 = _mm_loadu_pd (dval+4);
......@@ -1191,7 +1201,7 @@ void sse41_encode_double_array_2byte(long datasize,
s0 = _mm_packus_epi32(i0, i1);
s0 = _mm_shuffle_epi8 (s0, swap);
//(void) _mm_storeu_si128 (sgrib, s0);
(void) _mm_storeu_si128 (sgrib, s0);
//_____________________________________________________________________________
......@@ -1212,7 +1222,7 @@ void sse41_encode_double_array_2byte(long datasize,
i2 = _mm_cvttpd_epi32 (d2);
i4 = _mm_cvttpd_epi32 (d4);
i2 = _mm_unpacklo_epi64 (i2, i4);
//_____________________________________________________________________________
d3 = _mm_loadu_pd (dval+12);
......@@ -1233,7 +1243,7 @@ void sse41_encode_double_array_2byte(long datasize,
s1 = _mm_packus_epi32(i2, i3);
s1 = _mm_shuffle_epi8 (s1, swap);
//(void) _mm_storeu_si128 (sgrib+1, s1);
(void) _mm_storeu_si128 (sgrib+1, s1);
//_____________________________________________________________________________
......@@ -1241,32 +1251,30 @@ void sse41_encode_double_array_2byte(long datasize,
sgrib += 2;
}
if (i != datasize) {
i -= 16;
for ( j = i; j < datasize; j++ )
{
ival = (unsigned int) ((data[j] - zref) * factor + 0.5);
lGrib[2*j ] = ival >> 8;
lGrib[2*j+1] = ival;
}
}
if (i != datasize)
{
uint16_t ui16;
i -= 16;
for ( j = i; j < datasize; j++ )
{
ui16 = (uint16_t) ((data[j] - zref) * factor + 0.5);
lGrib[*gz+2*j ] = ui16 >> 8;
lGrib[*gz+2*j+1] = ui16;
}
}
*gz += 2*datasize;
fprintf (stderr,"z exit: %ld val 1: %d val z: %d\n",
*gz, 256*lGrib[0]+lGrib[1], 256*lGrib[datasize-2]+lGrib[datasize-1]);
return;
}
#endif // SSE4.1
#endif // SIMD variants
static
void encode_double_array_common(int numBits, long packStart, long datasize, GRIBPACK *lGrib,
const double *data, double zref, double factor, long *gz)
void encode_double_array_common(int numBits, size_t packStart, size_t datasize, GRIBPACK *lGrib,
const double *data, double zref, double factor, size_t *gz)
{
long i, z = *gz;
size_t i, z = *gz;
unsigned int ival;
int cbits, jbits;
unsigned int c;
......@@ -1313,10 +1321,10 @@ void encode_double_array_common(int numBits, long packStart, long datasize, GRIB
}
static
void encode_double_array_byte(int numBits, long packStart, long datasize,
void encode_double_array_byte(int numBits, size_t packStart, size_t datasize,
GRIBPACK *restrict lGrib,
const double *restrict data,
double zref, double factor, long *restrict gz)
double zref, double factor, size_t *gz)
{
#if defined _GET_X86_COUNTER || defined _GET_MACH_COUNTER
uint64_t start_minmax, end_minmax;
......@@ -1324,7 +1332,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
uint16_t ui16;
uint32_t ui32;
long i, z = *gz;
size_t i, z = *gz;
double tmp;
data += packStart;
......@@ -1335,6 +1343,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
#ifdef _GET_IBM_COUNTER
hpmStart(2, "pack 8 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
......@@ -1348,6 +1357,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
lGrib[z ] = (uint16_t) tmp;
z++;
}
#ifdef _GET_IBM_COUNTER
hpmStop(2);
#endif
......@@ -1416,6 +1426,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
#ifdef _GET_IBM_COUNTER
hpmStart(4, "pack 24 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
......@@ -1432,6 +1443,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
lGrib[z+2] = ui32;
z += 3;
}
#ifdef _GET_IBM_COUNTER
hpmStop(4);
#endif
......@@ -1441,6 +1453,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
#ifdef _GET_IBM_COUNTER
hpmStart(5, "pack 32 bit base");
#endif
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
......@@ -1458,6 +1471,7 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
lGrib[z+3] = ui32;
z += 4;
}
#ifdef _GET_IBM_COUNTER
hpmStop(5);
#endif
......@@ -1480,13 +1494,13 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
#if defined (_ARCH_PWR6)
static
void encode_double_array_unrolled(int numBits, long packStart, long datasize,
void encode_double_array_unrolled(int numBits, size_t packStart, size_t datasize,
GRIBPACK *restrict lGrib,
const double *restrict data,
double zref, double factor, long *restrict gz)
double zref, double factor, size_t *gz)
{
U_BYTEORDER;
long i, j, z = *gz;
size_t i, j, z = *gz;
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_2 8
#else
......@@ -4488,7 +4502,7 @@ int encodeBDS(GRIBPACK *lGrib, long *gribLen, int decscale, int *isec2, int *ise
/* Uwe Schulzweida, 11/04/2003 : Check that number of bits per value is not exceeded */
/* Uwe Schulzweida, 6/05/2003 : Copy result to fpval to prevent integer overflow */
long z = *gribLen;
size_t z = *gribLen;
long i, jloop;
int numBits;
int ival;
......@@ -10174,7 +10188,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return (gribLen);
}
static const char grb_libvers[] = "1.5.6" " of ""Dec 4 2012"" ""15:18:10";
static const char grb_libvers[] = "1.5.6" " of ""Dec 7 2012"" ""13:14:12";
const char *
cgribexLibraryVersion(void)
{
......
......@@ -2070,7 +2070,6 @@ int gribapiDefSteptype(int editionNumber, grib_handle *gh, int tsteptype, int gc
switch ( tsteptype )
{
case TSTEP_INSTANT: strcpy(stepType, "instant"); proDefTempNum = 0; break;
case TSTEP_AVG: strcpy(stepType, "avg"); proDefTempNum = 8; break;
case TSTEP_ACCUM: strcpy(stepType, "accum"); proDefTempNum = 8; break;
case TSTEP_MAX: strcpy(stepType, "max"); proDefTempNum = 8; break;
......@@ -2080,6 +2079,7 @@ int gribapiDefSteptype(int editionNumber, grib_handle *gh, int tsteptype, int gc
case TSTEP_SD: strcpy(stepType, "sd"); proDefTempNum = 8; break;
case TSTEP_COV: strcpy(stepType, "cov"); proDefTempNum = 8; break;
case TSTEP_RATIO: strcpy(stepType, "ratio"); proDefTempNum = 8; break;
case TSTEP_INSTANT: strcpy(stepType, "instant"); proDefTempNum = 0; break;
default: strcpy(stepType, "instant"); proDefTempNum = 0; break;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment