Commit 990dfde0 authored by Uwe Schulzweida
Browse files

cgribexlib update

parent 08a3c7af
/* Automatically generated by m214003 at 2012-06-08, do not edit */
/* Automatically generated by m214003 at 2012-06-14, do not edit */
/* CGRIBEXLIB_VERSION="1.5.3" */
......@@ -890,7 +890,6 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#else
#ifdef __SSE2__
sse2_minmax_val(data, datasize, fmin, fmax);
......@@ -944,7 +943,8 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#endif
#undef __UNROLL_DEPTH_1
#else
#else // original loop
#ifdef _GET_IBM_COUNTER
hpmStart(1, "minmax base");
......@@ -973,9 +973,9 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
hpmStop(1);
#endif
#endif
#endif
#endif
#endif // _ARCH_PWR6 && original loop
#endif // __SSE2__
#endif // __AVX__
#ifdef _GET_X86_COUNTER
end_minmax = _rdtsc();
......@@ -1001,7 +1001,136 @@ void minmax_val(const double *restrict data, long datasize, double *fmin, double
#ifdef _GET_IBM_COUNTER
#endif
// Compiler gate for the SIMD code paths: nothing is defined when
// __GNUC__ >= 4 or __ICC >= 1100 (both branches are empty); for every
// other compiler DISABLE_SIMD is defined.
#if defined(__GNUC__) && (__GNUC__ >= 4)
#elif defined(__ICC) && (__ICC >= 1100)
#else
#define DISABLE_SIMD
#endif
// When DISABLE_SIMD is set, __SSE4_1__ is undefined so that code guarded by
// `#ifdef __SSE4_1__` is compiled out -- unless ENABLE_SSE4_1 is defined,
// which keeps __SSE4_1__ as the compiler set it.
#ifdef DISABLE_SIMD
#ifndef ENABLE_SSE4_1
#undef __SSE4_1__
#endif
#endif
// SSE4.1
#if 0
#ifdef __SSE4_1__
/*
 * Convert four consecutive doubles to four 32-bit integer lanes:
 * trunc((v[k] - zref) * factor + 0.5) for k = 0..3, in memory order.
 * Unaligned loads are used, so `v` needs no particular alignment.
 */
static inline
__m128i sse41_scale4_to_epi32(const double *v, __m128d zref2, __m128d factor2, __m128d half2)
{
  __m128d lo = _mm_loadu_pd(v);
  __m128d hi = _mm_loadu_pd(v + 2);

  lo = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(lo, zref2), factor2), half2);
  hi = _mm_add_pd(_mm_mul_pd(_mm_sub_pd(hi, zref2), factor2), half2);

  // _mm_cvttpd_epi32 fills only the low 64 bits; merge both halves into one register
  return _mm_unpacklo_epi64(_mm_cvttpd_epi32(lo), _mm_cvttpd_epi32(hi));
}

/*
 * Pack `datasize` doubles into 16-bit unsigned integers (high byte first)
 * using SSE4.1.
 *
 * Each value is encoded as trunc((data[i] - zref) * factor + 0.5), saturated
 * to [0, 65535], and stored at lGrib[*gz + 2*i] (high byte) and
 * lGrib[*gz + 2*i + 1] (low byte).
 *
 * datasize  number of values in `data`; nothing is written when <= 0
 * lGrib     output byte buffer; needs room for *gz + 2*datasize bytes
 * data      input values (no alignment requirement)
 * zref      reference value subtracted from every input
 * factor    scale factor applied after subtracting zref
 * gz        in:  byte offset in lGrib of the first output byte
 *           out: byte offset just past the last byte written
 */
static
void sse41_encode_double_array_2byte(long datasize,
                                     unsigned char * restrict lGrib,
                                     const double * restrict data,
                                     double zref, double factor, long * restrict gz)
{
  if (datasize <= 0) return;

  // byte shuffle that swaps the two bytes of every 16-bit lane (little -> big endian)
  const __m128i swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

  const __m128d c0 = _mm_set1_pd(zref);
  const __m128d c1 = _mm_set1_pd(factor);
  const __m128d c2 = _mm_set1_pd(0.5);

  const long nblocks  = datasize / 16;   // full blocks of 16 values handled with SIMD
  const long residual = datasize % 16;   // remaining values handled by the scalar tail

  const double *dval = data;
  long z = *gz;
  long i;

  for (i = 0; i < nblocks; i++)
    {
      __m128i i0, i1, s0;

      // prefetch is only a hint and does not fault, so running past the end is harmless
      (void) _mm_prefetch((const char *) (dval + 8), _MM_HINT_NTA);

      i0 = sse41_scale4_to_epi32(dval,     c0, c1, c2);
      i1 = sse41_scale4_to_epi32(dval + 4, c0, c1, c2);
      s0 = _mm_packus_epi32(i0, i1);                  // 8 x int32 -> 8 x uint16, saturating
      s0 = _mm_shuffle_epi8(s0, swap);
      // lGrib + z has no guaranteed 16-byte alignment, hence the unaligned store
      _mm_storeu_si128((__m128i *) (lGrib + z), s0);

      (void) _mm_prefetch((const char *) (dval + 16), _MM_HINT_NTA);

      i0 = sse41_scale4_to_epi32(dval + 8,  c0, c1, c2);
      i1 = sse41_scale4_to_epi32(dval + 12, c0, c1, c2);
      s0 = _mm_packus_epi32(i0, i1);
      s0 = _mm_shuffle_epi8(s0, swap);
      _mm_storeu_si128((__m128i *) (lGrib + z + 16), s0);

      dval += 16;
      z    += 32;
    }

  // Scalar tail for the last datasize % 16 values. The branches reproduce what the
  // SIMD lanes yield: truncation toward zero followed by unsigned 16-bit saturation;
  // NaN and values outside the int32 range become INT_MIN in _mm_cvttpd_epi32 and
  // therefore saturate to 0.
  for (i = 0; i < residual; i++)
    {
      const double v = (dval[i] - zref) * factor + 0.5;
      unsigned int ival;

      if (!(v >= 1.0) || v >= 2147483648.0)
        ival = 0;
      else if (v >= 65535.0)
        ival = 65535;
      else
        ival = (unsigned int) v;

      lGrib[z]   = (unsigned char) (ival >> 8);
      lGrib[z+1] = (unsigned char) (ival & 0xFF);
      z += 2;
    }

  *gz = z;
}
#endif // SSE4.1
#endif
static
void encode_double_array_common(int numBits, long packStart, long datasize, GRIBPACK *lGrib,
const double *data, double zref, double factor, long *gz)
......@@ -1093,6 +1222,13 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
#ifdef _GET_IBM_COUNTER
hpmStart(3, "pack 16 bit base");
#endif
#if 0
sse41_encode_double_array_2byte(datasize, lGrib, data, zref, factor, &z);
#else
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
......@@ -1108,6 +1244,8 @@ void encode_double_array_byte(int numBits, long packStart, long datasize,
lGrib[z+1] = ival;
z += 2;
}
#endif
#ifdef _GET_IBM_COUNTER
hpmStop(3);
#endif
......@@ -9842,7 +9980,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return (gribLen);
}
static const char grb_libvers[] = "1.5.3" " of ""Jun 8 2012"" ""10:20:01";
static const char grb_libvers[] = "1.5.3" " of ""Jun 14 2012"" ""13:22:45";
const char *
cgribexLibraryVersion(void)
{
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment