Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
mpim-sw
libcdi
Commits
1b4b6b90
Commit
1b4b6b90
authored
Oct 30, 2015
by
Uwe Schulzweida
Browse files
cgribexlib update
parent
997c8c8c
Changes
3
Hide whitespace changes
Inline
Side-by-side
ChangeLog
View file @
1b4b6b90
2016-??-?? Uwe Schulzweida
* Version 1.7.1 released
* using CGRIBEX library version 1.7.4
2015-10-27 Uwe Schulzweida
* Version 1.7.0 released
...
...
src/binary.c
View file @
1b4b6b90
...
...
@@ -16,7 +16,7 @@ UINT32 get_UINT32(unsigned char *x)
switch
(
HOST_ENDIANNESS
)
{
case
CDI_BIGENDIAN
:
return
((
UINT32
)(((
UINT32
)
x
[
0
]
<<
24
)
+
((
UINT32
)
x
[
1
]
<<
16
)
+
((
UINT32
)
x
[
2
]
<<
8
)
+
(
UINT32
)
x
[
3
]));
return
((
UINT32
)(((
UINT32
)
x
[
0
]
<<
24
)
+
((
UINT32
)
x
[
1
]
<<
16
)
+
((
UINT32
)
x
[
2
]
<<
8
)
+
(
UINT32
)
x
[
3
]));
case
CDI_LITTLEENDIAN
:
return
((
UINT32
)(((
UINT32
)
x
[
3
]
<<
24
)
+
((
UINT32
)
x
[
2
]
<<
16
)
+
((
UINT32
)
x
[
1
]
<<
8
)
+
(
UINT32
)
x
[
0
]));
default:
...
...
@@ -31,9 +31,9 @@ UINT32 get_SUINT32(unsigned char *x)
switch
(
HOST_ENDIANNESS
)
{
case
CDI_BIGENDIAN
:
return
((
UINT32
)(((
UINT32
)
x
[
3
]
<<
24
)
+
((
UINT32
)
x
[
2
]
<<
16
)
+
((
UINT32
)
x
[
1
]
<<
8
)
+
(
UINT32
)
x
[
0
]));
return
((
UINT32
)(((
UINT32
)
x
[
3
]
<<
24
)
+
((
UINT32
)
x
[
2
]
<<
16
)
+
((
UINT32
)
x
[
1
]
<<
8
)
+
(
UINT32
)
x
[
0
]));
case
CDI_LITTLEENDIAN
:
return
((
UINT32
)(((
UINT32
)
x
[
0
]
<<
24
)
+
((
UINT32
)
x
[
1
]
<<
16
)
+
((
UINT32
)
x
[
2
]
<<
8
)
+
(
UINT32
)
x
[
3
]));
return
((
UINT32
)(((
UINT32
)
x
[
0
]
<<
24
)
+
((
UINT32
)
x
[
1
]
<<
16
)
+
((
UINT32
)
x
[
2
]
<<
8
)
+
(
UINT32
)
x
[
3
]));
default:
Error
(
"unhandled endianness %d"
,
HOST_ENDIANNESS
);
return
UINT32_C
(
0xFFFFFFFF
);
...
...
@@ -46,11 +46,11 @@ UINT64 get_UINT64(unsigned char *x)
switch
(
HOST_ENDIANNESS
)
{
case
CDI_BIGENDIAN
:
return
((
UINT64
)(((
UINT64
)
x
[
0
]
<<
56
)
+
((
UINT64
)
x
[
1
]
<<
48
)
+
((
UINT64
)
x
[
2
]
<<
40
)
+
((
UINT64
)
x
[
3
]
<<
32
)
+
((
UINT64
)
x
[
4
]
<<
24
)
+
((
UINT64
)
x
[
5
]
<<
16
)
+
((
UINT64
)
x
[
6
]
<<
8
)
+
(
UINT64
)
x
[
7
]));
return
((
UINT64
)(((
UINT64
)
x
[
0
]
<<
56
)
+
((
UINT64
)
x
[
1
]
<<
48
)
+
((
UINT64
)
x
[
2
]
<<
40
)
+
((
UINT64
)
x
[
3
]
<<
32
)
+
((
UINT64
)
x
[
4
]
<<
24
)
+
((
UINT64
)
x
[
5
]
<<
16
)
+
((
UINT64
)
x
[
6
]
<<
8
)
+
(
UINT64
)
x
[
7
]));
case
CDI_LITTLEENDIAN
:
return
((
UINT64
)(((
UINT64
)
x
[
7
]
<<
56
)
+
((
UINT64
)
x
[
6
]
<<
48
)
+
((
UINT64
)
x
[
5
]
<<
40
)
+
((
UINT64
)
x
[
4
]
<<
32
)
+
((
UINT64
)
x
[
3
]
<<
24
)
+
((
UINT64
)
x
[
2
]
<<
16
)
+
((
UINT64
)
x
[
1
]
<<
8
)
+
(
UINT64
)
x
[
0
]));
return
((
UINT64
)(((
UINT64
)
x
[
7
]
<<
56
)
+
((
UINT64
)
x
[
6
]
<<
48
)
+
((
UINT64
)
x
[
5
]
<<
40
)
+
((
UINT64
)
x
[
4
]
<<
32
)
+
((
UINT64
)
x
[
3
]
<<
24
)
+
((
UINT64
)
x
[
2
]
<<
16
)
+
((
UINT64
)
x
[
1
]
<<
8
)
+
(
UINT64
)
x
[
0
]));
default:
Error
(
"unhandled endianness %d"
,
HOST_ENDIANNESS
);
return
UINT64_C
(
0xFFFFFFFFFFFFFFFF
);
...
...
@@ -63,11 +63,11 @@ UINT64 get_SUINT64(unsigned char *x)
switch
(
HOST_ENDIANNESS
)
{
case
CDI_BIGENDIAN
:
return
((
UINT64
)(((
UINT64
)
x
[
7
]
<<
56
)
+
((
UINT64
)
x
[
6
]
<<
48
)
+
((
UINT64
)
x
[
5
]
<<
40
)
+
((
UINT64
)
x
[
4
]
<<
32
)
+
((
UINT64
)
x
[
3
]
<<
24
)
+
((
UINT64
)
x
[
2
]
<<
16
)
+
((
UINT64
)
x
[
1
]
<<
8
)
+
(
UINT64
)
x
[
0
]));
return
((
UINT64
)(((
UINT64
)
x
[
7
]
<<
56
)
+
((
UINT64
)
x
[
6
]
<<
48
)
+
((
UINT64
)
x
[
5
]
<<
40
)
+
((
UINT64
)
x
[
4
]
<<
32
)
+
((
UINT64
)
x
[
3
]
<<
24
)
+
((
UINT64
)
x
[
2
]
<<
16
)
+
((
UINT64
)
x
[
1
]
<<
8
)
+
(
UINT64
)
x
[
0
]));
case
CDI_LITTLEENDIAN
:
return
((
UINT64
)(((
UINT64
)
x
[
0
]
<<
56
)
+
((
UINT64
)
x
[
1
]
<<
48
)
+
((
UINT64
)
x
[
2
]
<<
40
)
+
((
UINT64
)
x
[
3
]
<<
32
)
+
((
UINT64
)
x
[
4
]
<<
24
)
+
((
UINT64
)
x
[
5
]
<<
16
)
+
((
UINT64
)
x
[
6
]
<<
8
)
+
(
UINT64
)
x
[
7
]));
return
((
UINT64
)(((
UINT64
)
x
[
0
]
<<
56
)
+
((
UINT64
)
x
[
1
]
<<
48
)
+
((
UINT64
)
x
[
2
]
<<
40
)
+
((
UINT64
)
x
[
3
]
<<
32
)
+
((
UINT64
)
x
[
4
]
<<
24
)
+
((
UINT64
)
x
[
5
]
<<
16
)
+
((
UINT64
)
x
[
6
]
<<
8
)
+
(
UINT64
)
x
[
7
]));
default:
Error
(
"unhandled endianness %d"
,
HOST_ENDIANNESS
);
return
UINT64_C
(
0xFFFFFFFFFFFFFFFF
);
...
...
src/cgribexlib.c
View file @
1b4b6b90
...
...
@@ -667,6 +667,16 @@ double intpow2(int x)
return
(
pow
(
2
.
0
,
(
double
)
x
));
}
/*
icc -g -Wall -O3 -march=native -std=c99 -qopt-report=5 -DTEST_MINMAXVAL -openmp -DOMP_SIMD minmax_val.c
result on hama2 (icc 16.0.0):
float:
minmax_val: fmin: -500000 fmax: 499999 time: 0.63s
double:
minmax_val: fmin: -500000 fmax: 499999 time: 2.98s
orig : fmin: -500000 fmax: 499999 time: 2.83s
simd : fmin: -500000 fmax: 499999 time: 2.82s
avx : fmin: -500000 fmax: 499999 time: 3.17s
gcc -g -Wall -O3 -march=native -std=c99 -DTEST_MINMAXVAL minmax_val.c
result on bailung (gcc 4.8.2):
orig : fmin: -500000 fmax: 499999 time: 4.82s
...
...
@@ -684,16 +694,6 @@ icc -g -Wall -O3 -march=native -std=c99 -qopt-report=5 -DTEST_MINMAXVAL -openmp
simd : fmin: -500000 fmax: 499999 time: 2.83s
avx : fmin: -500000 fmax: 499999 time: 2.92s
icc -g -Wall -O3 -march=native -std=c99 -qopt-report=5 -DTEST_MINMAXVAL -openmp -DOMP_SIMD minmax_val.c
result on hama (icc 15.0.1):
float:
minmax_val: fmin: -500000 fmax: 499999 time: 0.60s
double:
minmax_val: fmin: -500000 fmax: 499999 time: 3.06s
orig : fmin: -500000 fmax: 499999 time: 2.66s
simd : fmin: -500000 fmax: 499999 time: 6.65s
avx : fmin: -500000 fmax: 499999 time: 3.11s
xlc_r -g -O3 -qhot -q64 -qarch=auto -qtune=auto -qreport -DTEST_MINMAXVAL minmax_val.c
result on blizzard (xlc 12):
orig : fmin: -500000 fmax: 499999 time: 7.26s
...
...
@@ -998,7 +998,6 @@ void minmax_val_double_simd(const double *restrict data, long idatasize, double
static
void
minmax_val_double_orig
(
const
double
*
restrict
data
,
long
idatasize
,
double
*
fmin
,
double
*
fmax
)
{
size_t
i
;
size_t
datasize
=
idatasize
;
double
dmin
=
*
fmin
,
dmax
=
*
fmax
;
...
...
@@ -1011,7 +1010,7 @@ void minmax_val_double_orig(const double *restrict data, long idatasize, double
#elif defined (__ICC)
#pragma ivdep
#endif
for
(
i
=
0
;
i
<
datasize
;
++
i
)
for
(
size_t
i
=
0
;
i
<
datasize
;
++
i
)
{
dmin
=
dmin
<
data
[
i
]
?
dmin
:
data
[
i
];
dmax
=
dmax
>
data
[
i
]
?
dmax
:
data
[
i
];
...
...
@@ -1024,7 +1023,6 @@ void minmax_val_double_orig(const double *restrict data, long idatasize, double
static
void
minmax_val_float
(
const
float
*
restrict
data
,
long
idatasize
,
float
*
fmin
,
float
*
fmax
)
{
size_t
i
;
size_t
datasize
=
idatasize
;
float
dmin
=
*
fmin
,
dmax
=
*
fmax
;
...
...
@@ -1037,7 +1035,7 @@ void minmax_val_float(const float *restrict data, long idatasize, float *fmin, f
#elif defined (__ICC)
#pragma ivdep
#endif
for
(
i
=
0
;
i
<
datasize
;
++
i
)
for
(
size_t
i
=
0
;
i
<
datasize
;
++
i
)
{
dmin
=
dmin
<
data
[
i
]
?
dmin
:
data
[
i
];
dmax
=
dmax
>
data
[
i
]
?
dmax
:
data
[
i
];
...
...
@@ -1053,9 +1051,6 @@ void minmax_val_float(const float *restrict data, long idatasize, float *fmin, f
// TEST
#if defined(OMP_SIMD)
//#pragma omp declare reduction(xmin : double : omp_out = omp_in > omp_out ? omp_out : omp_in) initializer( omp_priv = { 1.e300 })
//#pragma omp declare reduction(xmax : double : omp_out = omp_in < omp_out ? omp_out : omp_in) initializer( omp_priv = { -1.e300 })
#if defined(GNUC_PUSH_POP)
#pragma GCC push_options
#pragma GCC optimize ("O3", "fast-math")
...
...
@@ -1063,15 +1058,13 @@ void minmax_val_float(const float *restrict data, long idatasize, float *fmin, f
static
void
minmax_val_double_simd
(
const
double
*
restrict
data
,
long
idatasize
,
double
*
fmin
,
double
*
fmax
)
{
size_t
i
;
size_t
datasize
=
idatasize
;
double
dmin
=
*
fmin
,
dmax
=
*
fmax
;
#if defined(_OPENMP)
//#pragma omp simd reduction(xmin:dmin) reduction(xmax:dmax)
#pragma omp simd
#pragma omp simd reduction(min:dmin) reduction(max:dmax)
#endif
for
(
i
=
0
;
i
<
datasize
;
++
i
)
for
(
size_t
i
=
0
;
i
<
datasize
;
++
i
)
{
dmin
=
dmin
<
data
[
i
]
?
dmin
:
data
[
i
];
dmax
=
dmax
>
data
[
i
]
?
dmax
:
data
[
i
];
...
...
@@ -1202,7 +1195,7 @@ int main(void)
{
float
fmin
,
fmax
;
float
*
data_sp
=
(
float
*
)
M
alloc
(
datasize
*
sizeof
(
float
));
float
*
data_sp
=
(
float
*
)
m
alloc
(
datasize
*
sizeof
(
float
));
for
(
long
i
=
0
;
i
<
datasize
/
2
;
i
++
)
data_sp
[
i
]
=
(
float
)
(
i
);
for
(
long
i
=
datasize
/
2
;
i
<
datasize
;
i
++
)
data_sp
[
i
]
=
(
float
)
(
-
datasize
+
i
);
...
...
@@ -1217,12 +1210,12 @@ int main(void)
}
t_end
=
dtime
();
printf
(
"minmax_val: fmin: %ld fmax: %ld time: %6.2fs
\n
"
,
(
long
)
fmin
,
(
long
)
fmax
,
t_end
-
t_begin
);
F
ree
(
data_sp
);
f
ree
(
data_sp
);
}
{
double
fmin
,
fmax
;
double
*
data_dp
=
(
double
*
)
M
alloc
(
datasize
*
sizeof
(
double
));
double
*
data_dp
=
(
double
*
)
m
alloc
(
datasize
*
sizeof
(
double
));
// for ( long i = datasize-1; i >= 0; i-- ) data[i] = (double) (-datasize/2 + i);
for
(
long
i
=
0
;
i
<
datasize
/
2
;
i
++
)
data_dp
[
i
]
=
(
double
)
(
i
);
...
...
@@ -1288,7 +1281,7 @@ int main(void)
t_end
=
dtime
();
printf
(
"pwr6u6 : fmin: %ld fmax: %ld time: %6.2fs
\n
"
,
(
long
)
fmin
,
(
long
)
fmax
,
t_end
-
t_begin
);
#endif
F
ree
(
data_dp
);
f
ree
(
data_dp
);
}
return
(
0
);
...
...
@@ -1799,240 +1792,6 @@ int main(void)
}
#endif // TEST_ENCODE
#undef DISABLE_SIMD
#undef _ENABLE_AVX
#undef _ENABLE_SSE4_1
//#undef _GET_X86_COUNTER
//#undef _GET_MACH_COUNTER
//#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
#if defined _GET_IBM_COUNTER
#include
<libhpc.h>
#elif defined _GET_X86_COUNTER
#include
<x86intrin.h>
#elif defined _GET_MACH_COUNTER
#include
<mach/mach_time.h>
#endif
#ifdef __cplusplus
#define __STDC_FORMAT_MACROS
#endif
#include
<inttypes.h>
#if defined(__GNUC__) && (__GNUC__ >= 4)
#elif defined(__ICC) && (__ICC >= 1100)
#elif defined(__clang__)
#else
#define DISABLE_SIMD
#endif
#define DISABLE_SIMD
#ifdef DISABLE_SIMD
# ifdef ENABLE_AVX
# define _ENABLE_AVX
# endif
# ifdef ENABLE_SSE4_1
# define _ENABLE_SSE4_1
# endif
#endif
#ifndef DISABLE_SIMD
# ifdef __AVX__
# define _ENABLE_AVX
# endif
# ifdef __SSE4_1__
# define _ENABLE_SSE4_1
# endif
#endif
#if defined _ENABLE_AVX
#include
<immintrin.h>
#elif defined _ENABLE_SSE4_1
#include
<smmintrin.h>
#endif
#if defined _ENABLE_AVX
static
void
avx_decode_array_2byte_double
(
size_t
datasize
,
const
unsigned
char
*
restrict
igrib
,
double
*
restrict
fpdata
,
double
fmin
,
double
zscale
)
{
size_t
i
,
j
;
size_t
nframes
=
datasize
;
size_t
residual
;
size_t
ofs
;
double
dval
;
double
*
data
=
fpdata
;
__m128i
*
sgrib
;
__m128i
mask
=
_mm_set_epi8
(
14
,
15
,
12
,
13
,
10
,
11
,
8
,
9
,
6
,
7
,
4
,
5
,
2
,
3
,
0
,
1
);
__m256d
ymm0
=
_mm256_set1_pd
(
fmin
);
__m256d
ymm1
=
_mm256_set1_pd
(
zscale
);
__m128i
xmm0
,
xmm1
,
xmm2
,
xmm3
;
__m256d
ymm2
,
ymm3
;
i
=
-
1
;
while
(
((
unsigned
long
)
data
)
%
32
!=
0
&&
datasize
>
0
)
{
i
++
;
dval
=
(((
int
)
igrib
[
2
*
i
]
<<
8
)
|
(
int
)
igrib
[
2
*
i
+
1
]);
fpdata
[
i
]
=
fmin
+
zscale
*
dval
;
data
++
;
nframes
--
;
}
if
(
i
==
-
1
)
i
=
0
;
sgrib
=
(
__m128i
*
)
(
igrib
+
i
);
while
(
nframes
>=
16
)
{
xmm0
=
_mm_loadu_si128
((
__m128i
*
)
sgrib
);
xmm0
=
_mm_shuffle_epi8
(
xmm0
,
mask
);
xmm1
=
_mm_shuffle_epi32
(
xmm0
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
xmm2
=
_mm_cvtepu16_epi32
(
xmm0
);
xmm3
=
_mm_cvtepu16_epi32
(
xmm1
);
ymm2
=
_mm256_cvtepi32_pd
(
xmm2
);
ymm2
=
_mm256_add_pd
(
_mm256_mul_pd
(
ymm2
,
ymm1
),
ymm0
);
(
void
)
_mm256_stream_pd
(
data
,
ymm2
);
ymm3
=
_mm256_cvtepi32_pd
(
xmm3
);
ymm3
=
_mm256_add_pd
(
_mm256_mul_pd
(
ymm3
,
ymm1
),
ymm0
);
(
void
)
_mm256_stream_pd
(
data
+
4
,
ymm3
);
xmm0
=
_mm_loadu_si128
((
__m128i
*
)
sgrib
+
1
);
xmm0
=
_mm_shuffle_epi8
(
xmm0
,
mask
);
xmm1
=
_mm_shuffle_epi32
(
xmm0
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
xmm2
=
_mm_cvtepu16_epi32
(
xmm0
);
xmm3
=
_mm_cvtepu16_epi32
(
xmm1
);
ymm2
=
_mm256_cvtepi32_pd
(
xmm2
);
ymm2
=
_mm256_add_pd
(
_mm256_mul_pd
(
ymm2
,
ymm1
),
ymm0
);
(
void
)
_mm256_stream_pd
(
data
+
8
,
ymm2
);
ymm3
=
_mm256_cvtepi32_pd
(
xmm3
);
ymm3
=
_mm256_add_pd
(
_mm256_mul_pd
(
ymm3
,
ymm1
),
ymm0
);
(
void
)
_mm256_stream_pd
(
data
+
12
,
ymm3
);
data
+=
16
;
sgrib
+=
2
;
nframes
-=
16
;
}
residual
=
nframes
;
ofs
=
datasize
-
residual
;
for
(
j
=
0
;
j
<
residual
;
j
++
)
{
dval
=
(((
int
)
igrib
[
2
*
(
ofs
+
j
)]
<<
8
)
|
(
int
)
igrib
[
2
*
(
ofs
+
j
)
+
1
]);
fpdata
[
ofs
+
j
]
=
fmin
+
zscale
*
dval
;
}
return
;
}
#elif defined _ENABLE_SSE4_1
static
void
sse41_decode_array_2byte_double
(
size_t
datasize
,
const
unsigned
char
*
restrict
igrib
,
double
*
restrict
fpdata
,
double
fmin
,
double
zscale
)
{
size_t
i
,
j
;
size_t
nframes
=
datasize
;
size_t
residual
;
size_t
ofs
;
double
dval
;
double
*
data
=
fpdata
;
__m128i
*
sgrib
;
__m128i
mask
=
_mm_set_epi8
(
14
,
15
,
12
,
13
,
10
,
11
,
8
,
9
,
6
,
7
,
4
,
5
,
2
,
3
,
0
,
1
);
__m128d
dmm8
=
_mm_set1_pd
(
fmin
);
__m128d
dmm9
=
_mm_set1_pd
(
zscale
);
__m128i
xmm4
,
xmm5
;
__m128i
xmm6
,
xmm7
;
__m128d
dmm0
,
dmm1
,
dmm2
,
dmm3
;
__m128d
dmm4
,
dmm5
,
dmm6
,
dmm7
;
i
=
-
1
;
while
(
((
unsigned
long
)
data
)
%
32
!=
0
&&
datasize
>
0
)
{
i
++
;
dval
=
(((
int
)
igrib
[
2
*
i
]
<<
8
)
|
(
int
)
igrib
[
2
*
i
+
1
]);
fpdata
[
i
]
=
fmin
+
zscale
*
dval
;
data
++
;
nframes
--
;
}
if
(
i
==
-
1
)
i
=
0
;
sgrib
=
(
__m128i
*
)
(
igrib
+
i
);
while
(
nframes
>=
16
)
{
xmm5
=
_mm_loadu_si128
((
__m128i
*
)
sgrib
);
xmm5
=
_mm_shuffle_epi8
(
xmm5
,
mask
);
xmm4
=
_mm_srli_si128
(
xmm5
,
8
);
xmm6
=
_mm_cvtepu16_epi32
(
xmm5
);
dmm0
=
_mm_cvtepi32_pd
(
xmm6
);
dmm0
=
_mm_add_pd
(
_mm_mul_pd
(
dmm0
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
,
dmm0
);
xmm7
=
_mm_srli_si128
(
xmm6
,
8
);
dmm1
=
_mm_cvtepi32_pd
(
xmm7
);
dmm1
=
_mm_add_pd
(
_mm_mul_pd
(
dmm1
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
2
,
dmm1
);
xmm6
=
_mm_cvtepu16_epi32
(
xmm4
);
dmm2
=
_mm_cvtepi32_pd
(
xmm6
);
dmm2
=
_mm_add_pd
(
_mm_mul_pd
(
dmm2
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
4
,
dmm2
);
xmm7
=
_mm_srli_si128
(
xmm6
,
8
);
dmm3
=
_mm_cvtepi32_pd
(
xmm7
);
dmm3
=
_mm_add_pd
(
_mm_mul_pd
(
dmm3
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
6
,
dmm3
);
xmm5
=
_mm_loadu_si128
((
__m128i
*
)
sgrib
+
1
);
xmm5
=
_mm_shuffle_epi8
(
xmm5
,
mask
);
xmm4
=
_mm_srli_si128
(
xmm5
,
8
);
xmm6
=
_mm_cvtepu16_epi32
(
xmm5
);
dmm4
=
_mm_cvtepi32_pd
(
xmm6
);
dmm4
=
_mm_add_pd
(
_mm_mul_pd
(
dmm4
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
8
,
dmm4
);
xmm7
=
_mm_srli_si128
(
xmm6
,
8
);
dmm5
=
_mm_cvtepi32_pd
(
xmm7
);
dmm5
=
_mm_add_pd
(
_mm_mul_pd
(
dmm5
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
10
,
dmm5
);
xmm6
=
_mm_cvtepu16_epi32
(
xmm4
);
dmm6
=
_mm_cvtepi32_pd
(
xmm6
);
dmm6
=
_mm_add_pd
(
_mm_mul_pd
(
dmm6
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
12
,
dmm6
);
xmm7
=
_mm_srli_si128
(
xmm6
,
8
);
dmm7
=
_mm_cvtepi32_pd
(
xmm7
);
dmm7
=
_mm_add_pd
(
_mm_mul_pd
(
dmm7
,
dmm9
),
dmm8
);
(
void
)
_mm_stream_pd
(
data
+
14
,
dmm7
);
data
+=
16
;
sgrib
+=
2
;
nframes
-=
16
;
}
residual
=
nframes
;
ofs
=
datasize
-
residual
;
for
(
j
=
0
;
j
<
residual
;
j
++
)
{
dval
=
(((
int
)
igrib
[
2
*
(
ofs
+
j
)]
<<
8
)
|
(
int
)
igrib
[
2
*
(
ofs
+
j
)
+
1
]);
fpdata
[
ofs
+
j
]
=
fmin
+
zscale
*
dval
;
}
return
;
}
#endif
#undef DISABLE_SIMD
#undef _ENABLE_AVX
#undef _ENABLE_SSE4_1
...
...
@@ -9327,7 +9086,6 @@ void gribPrintSec3_float(int *isec0, int *isec3, float *fsec3) {gribPrintSec3SP(
void
gribPrintSec4_float
(
int
*
isec0
,
int
*
isec4
,
float
*
fsec4
)
{
gribPrintSec4SP
(
isec0
,
isec4
,
fsec4
);}
#ifdef T
#undef T
#endif
...
...
@@ -9335,8 +9093,8 @@ void gribPrintSec4_float(int *isec0, int *isec4, float *fsec4) {gribPrintSec4SP(
#ifdef T
static
void
TEMPLATE
(
decode_array_common
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
void
TEMPLATE
(
decode_array_common
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
{
/* code from wgrib routine BDS_unpack */
const
unsigned
char
*
bits
=
igrib
;
...
...
@@ -9376,8 +9134,8 @@ static double shift[9] = {1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0};
#endif
static
void
TEMPLATE
(
decode_array_common2
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
void
TEMPLATE
(
decode_array_common2
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
{
/* code from wgrib routine BDS_unpack */
const
unsigned
char
*
bits
=
igrib
;
...
...
@@ -9418,6 +9176,33 @@ void TEMPLATE(decode_array_common2,T)(const unsigned char * restrict igrib, long
}
}
#define gribSwapByteOrder_uint16(ui16) ((ui16<<8) | (ui16>>8))
static
void
TEMPLATE
(
decode_array_2byte
,
T
)(
size_t
jlend
,
const
unsigned
char
*
restrict
igrib
,
T
*
fpdata
,
T
fmin
,
T
zscale
)
{
U_BYTEORDER
;
const
uint16_t
*
restrict
sgrib
=
(
uint16_t
*
)
(
igrib
);
if
(
IS_BIGENDIAN
()
)
{
for
(
size_t
i
=
0
;
i
<
jlend
;
i
++
)
{
fpdata
[
i
]
=
fmin
+
zscale
*
sgrib
[
i
];
}
}
else
{
uint16_t
ui16
;
for
(
size_t
i
=
0
;
i
<
jlend
;
i
++
)
{
ui16
=
gribSwapByteOrder_uint16
(
sgrib
[
i
]);
fpdata
[
i
]
=
fmin
+
zscale
*
ui16
;
}
}
}
static
void
TEMPLATE
(
decode_array
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
numBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
...
...
@@ -9503,57 +9288,7 @@ void TEMPLATE(decode_array,T)(const unsigned char *restrict igrib, long jlend, i
}
else
if
(
numBits
==
16
)
{
#ifdef _GET_IBM_COUNTER
hpmStart
(
6
,
"unpack 16 bit base"
);
#elif defined _GET_X86_COUNTER
start_decode
=
_rdtsc
();
#elif defined _GET_MACH_COUNTER
start_decode
=
mach_absolute_time
();
#endif
if
(
sizeof
(
T
)
==
sizeof
(
double
)
)
{
#if defined _ENABLE_AVX
printf
(
"AVX selected ...
\n
"
);
avx_decode_array_2byte_double
((
size_t
)
jlend
,
igrib
,
fpdata
,
fmin
,
zscale
);
#elif defined _ENABLE_SSE4_1
printf
(
"SSE4 selected ...
\n
"
);
sse41_decode_array_2byte_double
((
size_t
)
jlend
,
igrib
,
fpdata
,
fmin
,
zscale
);
#else
for
(
i
=
0
;
i
<
jlend
;
i
++
)
{
dval
=
(((
int
)
igrib
[
2
*
i
]
<<
8
)
|
(
int
)
igrib
[
2
*
i
+
1
]);
fpdata
[
i
]
=
fmin
+
zscale
*
dval
;
}
#endif
}
else
{
for
(
i
=
0
;
i
<
jlend
;
i
++
)
{
dval
=
(((
int
)
igrib
[
2
*
i
]
<<
8
)
|
(
int
)
igrib
[
2
*
i
+
1
]);
fpdata
[
i
]
=
fmin
+
zscale
*
dval
;
}
}
#if defined _GET_X86_COUNTER || defined _GET_MACH_COUNTER
#if defined _GET_X86_COUNTER
end_decode
=
_rdtsc
();
#elif defined _GET_MACH_COUNTER
end_decode
=
mach_absolute_time
();
#endif
#if defined _ENABLE_AVX
printf
(
"AVX encoding cycles:: %"
PRIu64
"
\n
"
,
end_decode
-
start_decode
);
#elif defined _ENABLE_SSE4_1
printf
(
"SSE 4.1 encoding cycles:: %"
PRIu64
"
\n
"
,
end_decode
-
start_decode
);
#else
printf
(
"loop encoding cycles:: %"
PRIu64
"
\n
"
,
end_decode
-
start_decode
);
#endif
#endif
#ifdef _GET_IBM_COUNTER
hpmStop
(
6
);
#endif
TEMPLATE
(
decode_array_2byte
,
T
)((
size_t
)
jlend
,
igrib
,
fpdata
,
fmin
,
zscale
);
}
else
if
(
numBits
==
24
)
for
(
i
=
0
;
i
<
jlend
;
i
++
)
...
...
@@ -9593,8 +9328,8 @@ void TEMPLATE(decode_array,T)(const unsigned char *restrict igrib, long jlend, i
#ifdef T
static
void
TEMPLATE
(
decode_array_common
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
void
TEMPLATE
(
decode_array_common
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
{
/* code from wgrib routine BDS_unpack */
const
unsigned
char
*
bits
=
igrib
;
...
...
@@ -9634,8 +9369,8 @@ static double shift[9] = {1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0};
#endif
static
void
TEMPLATE
(
decode_array_common2
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
void
TEMPLATE
(
decode_array_common2
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
NumBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
{
/* code from wgrib routine BDS_unpack */
const
unsigned
char
*
bits
=
igrib
;
...
...
@@ -9676,6 +9411,33 @@ void TEMPLATE(decode_array_common2,T)(const unsigned char * restrict igrib, long
}
}
#define gribSwapByteOrder_uint16(ui16) ((ui16<<8) | (ui16>>8))
static
void
TEMPLATE
(
decode_array_2byte
,
T
)(
size_t
jlend
,
const
unsigned
char
*
restrict
igrib
,
T
*
fpdata
,
T
fmin
,
T
zscale
)
{
U_BYTEORDER
;
const
uint16_t
*
restrict
sgrib
=
(
uint16_t
*
)
(
igrib
);
if
(
IS_BIGENDIAN
()
)
{
for
(
size_t
i
=
0
;
i
<
jlend
;
i
++
)
{
fpdata
[
i
]
=
fmin
+
zscale
*
sgrib
[
i
];
}
}
else
{
uint16_t
ui16
;
for
(
size_t
i
=
0
;
i
<
jlend
;
i
++
)
{
ui16
=
gribSwapByteOrder_uint16
(
sgrib
[
i
]);
fpdata
[
i
]
=
fmin
+
zscale
*
ui16
;
}
}
}
static
void
TEMPLATE
(
decode_array
,
T
)(
const
unsigned
char
*
restrict
igrib
,
long
jlend
,
int
numBits
,
T
fmin
,
T
zscale
,
T
*
restrict
fpdata
)
...
...
@@ -9761,57 +9523,7 @@ void TEMPLATE(decode_array,T)(const unsigned char *restrict igrib, long jlend, i
}
else
if
(
numBits
==
16
)
{
#ifdef _GET_IBM_COUNTER
hpmStart
(
6
,
"unpack 16 bit base"
);