Commit e0885289 authored by Uwe Schulzweida's avatar Uwe Schulzweida
Browse files

cgribexlib update

parent 86974e7c
/* Automatically generated by m214003 at 2013-01-14, do not edit */
/* Automatically generated by m214003 at 2013-01-29, do not edit */
/* CGRIBEXLIB_VERSION="1.6.0" */
......@@ -1698,13 +1698,222 @@ void encode_double_array_unrolled(int numBits, size_t packStart, size_t datasize
*gz = z;
#undef __UNROLL_DEPTH_2
}
//#undef _GET_X86_COUNTER
//#undef _GET_MACH_COUNTER
//#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
#if defined _GET_IBM_COUNTER
#include <libhpc.h>
#elif defined _GET_X86_COUNTER
#include <x86intrin.h>
#elif defined _GET_MACH_COUNTER
#include <mach/mach_time.h>
#endif
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#if ((__GNUC__ >= 4) || (__ICC >= 1100) || defined (__clang__))
#define _ENABLE_SIMD
#include <immintrin.h>
#undef __AVX
#ifdef __AVX__
#define _ENABLE_AVX
#elif __SSE4_1__
#define _ENABLE_SSE4_1
#endif
#endif
#undef _ENABLE_AVX
#undef _ENABLE_SSE4_1
#if defined _ENABLE_AVX
static
void avx_decode_double_array_2byte(size_t datasize, const unsigned char * restrict igrib,
double * restrict fpdata, double fmin, double zscale)
{
size_t i, j;
size_t nframes = datasize;
size_t residual;
size_t ofs;
double dval;
double *data = fpdata;
__m128i *sgrib;
__m128i mask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m256d ymm0 = _mm256_set1_pd(fmin);
__m256d ymm1 = _mm256_set1_pd(zscale);
__m128i xmm0, xmm1, xmm2, xmm3;
__m256d ymm2, ymm3;
i = -1;
while ( ((unsigned long) data) % 32 != 0 && datasize > 0)
{
i++;
dval = (((int)igrib[2*i] << 8) | (int)igrib[2*i+1]);
fpdata[i] = fmin + zscale * dval;
data++;
nframes--;
}
if (i == -1) i = 0;
sgrib = (__m128i *) (igrib+i);
while (nframes >= 16)
{
xmm0 = _mm_loadu_si128((__m128i *) sgrib);
xmm0 = _mm_shuffle_epi8(xmm0, mask);
xmm1 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(1, 0, 3, 2));
xmm2 = _mm_cvtepu16_epi32(xmm0);
xmm3 = _mm_cvtepu16_epi32(xmm1);
ymm2 = _mm256_cvtepi32_pd(xmm2);
ymm2 = _mm256_add_pd(_mm256_mul_pd(ymm2, ymm1), ymm0);
(void) _mm256_stream_pd(data, ymm2);
ymm3 = _mm256_cvtepi32_pd(xmm3);
ymm3 = _mm256_add_pd(_mm256_mul_pd(ymm3, ymm1), ymm0);
(void) _mm256_stream_pd(data+4, ymm3);
xmm0 = _mm_loadu_si128((__m128i *) sgrib + 1);
xmm0 = _mm_shuffle_epi8(xmm0, mask);
xmm1 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(1, 0, 3, 2));
xmm2 = _mm_cvtepu16_epi32(xmm0);
xmm3 = _mm_cvtepu16_epi32(xmm1);
ymm2 = _mm256_cvtepi32_pd(xmm2);
ymm2 = _mm256_add_pd(_mm256_mul_pd(ymm2, ymm1), ymm0);
(void) _mm256_stream_pd(data+8, ymm2);
ymm3 = _mm256_cvtepi32_pd(xmm3);
ymm3 = _mm256_add_pd(_mm256_mul_pd(ymm3, ymm1), ymm0);
(void) _mm256_stream_pd(data+12, ymm3);
data += 16;
sgrib += 2;
nframes -= 16;
}
residual = nframes;
ofs = datasize - residual;
for ( j = 0; j < residual; j++ )
{
dval = (((int)igrib[2*(ofs+j)] << 8) | (int)igrib[2*(ofs+j)+1]);
fpdata[ofs+j] = fmin + zscale * dval;
}
return;
}
#elif defined _ENABLE_SSE4_1
static
void sse41_decode_double_array_2byte(size_t datasize, const unsigned char * restrict igrib,
double * restrict fpdata, double fmin, double zscale)
{
size_t i, j;
size_t nframes = datasize;
size_t residual;
size_t ofs;
double dval;
double *data = fpdata;
__m128i *sgrib;
__m128i mask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
__m128d dmm8 = _mm_set1_pd(fmin);
__m128d dmm9 = _mm_set1_pd(zscale);
__m128i xmm4, xmm5;
__m128i xmm6, xmm7;
__m128d dmm0, dmm1, dmm2, dmm3;
__m128d dmm4, dmm5, dmm6, dmm7;
i = -1;
while ( ((unsigned long) data) % 32 != 0 && datasize > 0)
{
i++;
dval = (((int)igrib[2*i] << 8) | (int)igrib[2*i+1]);
fpdata[i] = fmin + zscale * dval;
data++;
nframes--;
}
if (i == -1) i = 0;
sgrib = (__m128i *) (igrib+i);
while (nframes >= 16)
{
xmm5 = _mm_loadu_si128((__m128i *) sgrib);
xmm5 = _mm_shuffle_epi8(xmm5, mask);
xmm4 = _mm_srli_si128(xmm5, 8);
xmm6 = _mm_cvtepu16_epi32(xmm5);
dmm0 = _mm_cvtepi32_pd(xmm6);
dmm0 = _mm_add_pd(_mm_mul_pd(dmm0, dmm9), dmm8);
(void) _mm_stream_pd(data, dmm0);
xmm7 = _mm_srli_si128(xmm6, 8);
dmm1 = _mm_cvtepi32_pd(xmm7);
dmm1 = _mm_add_pd(_mm_mul_pd(dmm1, dmm9), dmm8);
(void) _mm_stream_pd(data+2, dmm1);
xmm6 = _mm_cvtepu16_epi32(xmm4);
dmm2 = _mm_cvtepi32_pd(xmm6);
dmm2 = _mm_add_pd(_mm_mul_pd(dmm2, dmm9), dmm8);
(void) _mm_stream_pd(data+4, dmm2);
xmm7 = _mm_srli_si128(xmm6, 8);
dmm3 = _mm_cvtepi32_pd(xmm7);
dmm3 = _mm_add_pd(_mm_mul_pd(dmm3, dmm9), dmm8);
(void) _mm_stream_pd(data+6, dmm3);
xmm5 = _mm_loadu_si128((__m128i *) sgrib+1);
xmm5 = _mm_shuffle_epi8(xmm5, mask);
xmm4 = _mm_srli_si128(xmm5, 8);
xmm6 = _mm_cvtepu16_epi32(xmm5);
dmm4 = _mm_cvtepi32_pd(xmm6);
dmm4 = _mm_add_pd(_mm_mul_pd(dmm4, dmm9), dmm8);
(void) _mm_stream_pd(data+8, dmm4);
xmm7 = _mm_srli_si128(xmm6, 8);
dmm5 = _mm_cvtepi32_pd(xmm7);
dmm5 = _mm_add_pd(_mm_mul_pd(dmm5, dmm9), dmm8);
(void) _mm_stream_pd(data+10, dmm5);
xmm6 = _mm_cvtepu16_epi32(xmm4);
dmm6 = _mm_cvtepi32_pd(xmm6);
dmm6 = _mm_add_pd(_mm_mul_pd(dmm6, dmm9), dmm8);
(void) _mm_stream_pd(data+12, dmm6);
xmm7 = _mm_srli_si128(xmm6, 8);
dmm7 = _mm_cvtepi32_pd(xmm7);
dmm7 = _mm_add_pd(_mm_mul_pd(dmm7, dmm9), dmm8);
(void) _mm_stream_pd(data+14, dmm7);
data += 16;
sgrib += 2;
nframes -= 16;
}
residual = nframes;
ofs = datasize - residual;
for ( j = 0; j < residual; j++ )
{
dval = (((int)igrib[2*(ofs+j)] << 8) | (int)igrib[2*(ofs+j)+1]);
fpdata[ofs+j] = fmin + zscale * dval;
}
return;
}
#endif
static
void decode_double_array_common(unsigned char *igrib, long jlend, int NumBits,
double fmin, double zscale, double *fpdata)
void decode_double_array_common(const unsigned char * restrict igrib, long jlend, int NumBits,
double fmin, double zscale, double * restrict fpdata)
{
/* code from wgrib routine BDS_unpack */
unsigned char *bits = igrib;
const unsigned char *bits = igrib;
unsigned int jmask;
long i;
unsigned int tbits = 0;
......@@ -1738,11 +1947,11 @@ static unsigned int mask[] = {0,1,3,7,15,31,63,127,255};
static double shift[9] = {1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0};
static
void decode_double_array_common2(unsigned char *igrib, long jlend, int NumBits,
double fmin, double zscale, double *fpdata)
void decode_double_array_common2(const unsigned char * restrict igrib, long jlend, int NumBits,
double fmin, double zscale, double * restrict fpdata)
{
/* code from wgrib routine BDS_unpack */
unsigned char *bits = igrib;
const unsigned char *bits = igrib;
long i;
int n_bits = NumBits;
int c_bits, j_bits;
......@@ -1781,9 +1990,13 @@ void decode_double_array_common2(unsigned char *igrib, long jlend, int NumBits,
}
static
void decode_double_array_byte(unsigned char *igrib, long jlend, int numBits,
double fmin, double zscale, double *fpdata)
void decode_double_array_byte(const unsigned char * restrict igrib, long jlend, int numBits,
double fmin, double zscale, double * restrict fpdata)
{
#if defined _GET_X86_COUNTER || defined _GET_MACH_COUNTER
uint64_t start_decode, end_decode;
#endif
long i;
double dval;
#if defined (VECTORCODE)
......@@ -1860,11 +2073,51 @@ void decode_double_array_byte(unsigned char *igrib, long jlend, int numBits,
fpdata[i] = fmin + zscale * dval;
}
else if ( numBits == 16 )
for ( i = 0; i < jlend; i++ )
{
dval = (((int)igrib[2*i ] << 8) | (int)igrib[2*i+1]);
fpdata[i] = fmin + zscale * dval;
}
{
#ifdef _GET_IBM_COUNTER
hpmStart(6, "unpack 16 bit base");
#elif defined _GET_X86_COUNTER
start_decode = _rdtsc();
#elif defined _GET_MACH_COUNTER
start_decode = mach_absolute_time();
#endif
#if defined _ENABLE_AVX
printf("AVX selected ...\n");
avx_decode_double_array_2byte((size_t) jlend, igrib, fpdata, fmin, zscale);
#elif defined _ENABLE_SSE4_1
printf("SSE4 selected ...\n");
sse41_decode_double_array_2byte((size_t) jlend, igrib, fpdata, fmin, zscale);
#else
for ( i = 0; i < jlend; i++ )
{
dval = (((int)igrib[2*i ] << 8) | (int)igrib[2*i+1]);
fpdata[i] = fmin + zscale * dval;
}
#endif
#if defined _GET_X86_COUNTER || defined _GET_MACH_COUNTER
#if defined _GET_X86_COUNTER
end_decode = _rdtsc();
#elif defined _GET_MACH_COUNTER
end_decode = mach_absolute_time();
#endif
#if defined _ENABLE_AVX
printf("AVX encoding cycles:: %" PRIu64 "\n",
end_decode-start_decode);
#elif defined _ENABLE_SSE4_1
printf("SSE 4.1 encoding cycles:: %" PRIu64 "\n",
end_decode-start_decode);
#else
printf("loop encoding cycles:: %" PRIu64 "\n",
end_decode-start_decode);
#endif
#endif
#ifdef _GET_IBM_COUNTER
hpmStop(6);
#endif
}
else if ( numBits == 24 )
for ( i = 0; i < jlend; i++ )
{
......@@ -1895,8 +2148,8 @@ void decode_double_array_byte(unsigned char *igrib, long jlend, int numBits,
}
static
void decode_double_array_unrolled(unsigned char *igrib, long jlend, int numBits,
double fmin, double zscale, double *fpdata)
void decode_double_array_unrolled(const unsigned char * restrict igrib, long jlend, int numBits,
double fmin, double zscale, double * restrict fpdata)
{
decode_double_array_byte(igrib, jlend, numBits, fmin, zscale, fpdata);
}
......@@ -2278,6 +2531,7 @@ LABEL900:
return (pval);
} /* decfp2 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdarg.h>
......@@ -2356,8 +2610,9 @@ void gribDateTime(int *isec1, int *date, int *time)
{
static int lprint = TRUE;
int ryear, rmonth, rday, rhour, rminute, second;
int time_period = 0;
int julday, secofday, addsec;
int julday, secofday;
int64_t addsec = 0;
int64_t time_period = 0;
int century;
extern int grib_calendar;
......@@ -9629,16 +9884,24 @@ void gribRepair1(int nrec, long recpos, long recsize, unsigned char *gribbuffer)
# include "config.h"
#endif
#if defined (HAVE_LIBSZ)
#if defined (HAVE_LIBSZ) || defined (HAVE_LIBAEC)
#if defined(__cplusplus)
extern "C" {
#endif
#if defined (HAVE_LIBAEC)
# include <libaec.h>
#else
# include <szlib.h>
#endif
#if defined (__cplusplus)
}
#endif
#if defined (HAVE_LIBAEC)
# define AEC_FLAGS (AEC_DATA_MSB | AEC_DATA_PREPROCESS)
#else
# define OPTIONS_MASK (SZ_RAW_OPTION_MASK | SZ_MSB_OPTION_MASK | SZ_NN_OPTION_MASK)
#endif
# define PIXELS_PER_BLOCK (8)
# define PIXELS_PER_SCANLINE (PIXELS_PER_BLOCK*128)
......@@ -9698,7 +9961,7 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
int gribLen;
int rec_len;
int llarge = FALSE;
#if ! defined (HAVE_LIBSZ)
#if ! (defined (HAVE_LIBSZ) || defined (HAVE_LIBAEC))
static int libszwarn = 1;
#endif
unsigned char *pds = NULL, *gds = NULL, *bms = NULL, *bds = NULL;
......@@ -9715,7 +9978,7 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
return (rec_len);
}
#if defined (HAVE_LIBSZ)
#if defined (HAVE_LIBSZ) || defined (HAVE_LIBAEC)
{
long i;
......@@ -9723,9 +9986,14 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
int gribLenOld = 0;
int status;
int datstart, datsize;
#if defined (HAVE_LIBAEC)
struct aec_stream strm;
#else
SZ_com_t sz_param; /* szip parameter block */
#endif
unsigned char *dest, *source;
size_t destLen, sourceLen;
int bits_per_sample;
int bds_len, bds_nbits, bds_flag, lspherc, lcomplex,/* lcompress,*/ bds_ubits;
int bds_head = 11;
int bds_ext = 0;
......@@ -9754,15 +10022,23 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
return (rec_len);
}
sz_param.options_mask = OPTIONS_MASK;
if ( bds_nbits == 24 )
sz_param.bits_per_pixel = 8;
bits_per_sample = 8;
else
sz_param.bits_per_pixel = bds_nbits;
bits_per_sample = bds_nbits;
#if defined (HAVE_LIBAEC)
strm.bits_per_sample = bits_per_sample;
strm.block_size = PIXELS_PER_BLOCK;
strm.rsi = PIXELS_PER_SCANLINE / PIXELS_PER_BLOCK;
strm.flags = AEC_FLAGS;
#else
sz_param.options_mask = OPTIONS_MASK;
sz_param.bits_per_pixel = bits_per_sample;
sz_param.pixels_per_block = PIXELS_PER_BLOCK;
sz_param.pixels_per_scanline = PIXELS_PER_SCANLINE;
#endif
if ( lspherc )
{
......@@ -9807,6 +10083,18 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
source = pbuf;
}
#if defined (HAVE_LIBAEC)
strm.next_in = source;
strm.avail_in = sourceLen;
strm.next_out = dest;
strm.avail_out = destLen;
status = aec_buffer_encode(&strm);
if (status != AEC_OK)
Warning("AEC ERROR: %d code %3d level %3d", status, PDS_Parameter, PDS_Level2);
destLen = strm.total_out;
#else
status = SZ_BufftoBuffCompress(dest, &destLen, source, sourceLen, &sz_param);
if ( status != SZ_OK )
{
......@@ -9821,6 +10109,7 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
else
Warning("SZ ERROR: %d code %3d level %3d", status, PDS_Parameter, PDS_Level2);
}
#endif
if ( pbuf ) free(pbuf);
/*
......@@ -9958,7 +10247,7 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
#else
if ( libszwarn )
{
Warning("Compression disabled, szlib not available!");
Warning("Compression disabled, szlib or libaec not available!");
libszwarn = 0;
}
#endif
......@@ -9973,13 +10262,13 @@ int gribZip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufs
int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbufsize)
{
#if ! defined (HAVE_LIBSZ)
#if ! (defined (HAVE_LIBSZ) || defined (HAVE_LIBAEC))
static int libszwarn = 1;
#endif
int nerr;
unsigned char *pds = NULL, *gds = NULL, *bms = NULL, *bds = NULL;
int bdsLen, recLen, gribLen = 0;
char *dest, *source;
unsigned char *dest, *source;
size_t destLen, sourceLen;
int /* bds_len, */ bds_nbits, bds_flag, lspherc, lcomplex /*, lcompress*/;
int bds_head = 11;
......@@ -10024,7 +10313,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
datstart = bds_head + bds_ext;
source = (char *) bds + datstart + bds_zoffset;
source = bds + datstart + bds_zoffset;
if ( llarge )
sourceLen = ((size_t) ((bds[21]<<24)+(bds[22]<<16)+(bds[23]<<8)+bds[24]));
else
......@@ -10037,7 +10326,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return (0);
}
dest = (char *) bds + datstart;
dest = bds + datstart;
if ( llarge )
destLen = ((size_t) ((bds[17]<<24)+(bds[18]<<16)+(bds[19]<<8)+bds[20]));
else
......@@ -10047,23 +10336,35 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
bdsLen = datstart + destLen;
#if defined (HAVE_LIBSZ)
#if defined (HAVE_LIBSZ) || defined (HAVE_LIBAEC)
{
int status;
long i;
size_t tmpLen;
int bds_ubits;
int bits_per_sample;
#if defined (HAVE_LIBAEC)
struct aec_stream strm;
#else
SZ_com_t sz_param; /* szip parameter block */
sz_param.options_mask = OPTIONS_MASK;
#endif
if ( bds_nbits == 24 )
sz_param.bits_per_pixel = 8;
bits_per_sample = 8;
else
sz_param.bits_per_pixel = bds_nbits;
bits_per_sample = bds_nbits;
#if defined (HAVE_LIBAEC)
strm.bits_per_sample = bits_per_sample;
strm.block_size = PIXELS_PER_BLOCK;
strm.rsi = PIXELS_PER_SCANLINE / PIXELS_PER_BLOCK;
strm.flags = AEC_FLAGS;
#else
sz_param.options_mask = OPTIONS_MASK;
sz_param.bits_per_pixel = bits_per_sample;
sz_param.pixels_per_block = PIXELS_PER_BLOCK;
sz_param.pixels_per_scanline = PIXELS_PER_SCANLINE;
#endif
if ( bds_ext )
for ( i = 0; i < bds_ext; ++i )
......@@ -10076,6 +10377,18 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
*/
tmpLen = destLen;
#if defined (HAVE_LIBAEC)
strm.next_in = source;
strm.avail_in = sourceLen;
strm.next_out = dest;
strm.avail_out = tmpLen;
status = aec_buffer_decode(&strm);
if (status != AEC_OK)
Warning("AEC ERROR: %d code %3d level %3d", status, PDS_Parameter, PDS_Level2);
tmpLen = strm.total_out;
#else
status = SZ_BufftoBuffDecompress(dest, &tmpLen, source, sourceLen, &sz_param);
if ( status != SZ_OK )
{
......@@ -10090,6 +10403,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
else
Warning("SZ ERROR: %d code %3d level %3d", status, PDS_Parameter, PDS_Level2);
}
#endif
/*
fprintf(stderr, "gribUnzip: sl = %ld dl = %ld tl = %ld\n",
(long)sourceLen, (long)destLen,(long) tmpLen);
......@@ -10188,7 +10502,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return (gribLen);
}
static const char grb_libvers[] = "1.6.0" " of ""Jan 14 2013"" ""13:42:16";
static const char grb_libvers[] = "1.6.0" " of ""Jan 29 2013"" ""10:43:15";
const char *
cgribexLibraryVersion(void)
{
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment