Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
mpim-sw
libcdi
Commits
204e735a
Commit
204e735a
authored
Apr 02, 2012
by
Uwe Schulzweida
Browse files
cgribexlib update
parent
5180bb0d
Changes
2
Hide whitespace changes
Inline
Side-by-side
NEWS
View file @
204e735a
...
...
@@ -4,7 +4,7 @@ CDI NEWS
Version 1.5.5 (?? May 2012):
New features:
*
a
dded single precision support: streamWriteVarFloat, streamWriteVarSliceFloat
*
A
dded single precision support: streamWriteVarFloat, streamWriteVarSliceFloat
Version 1.5.4 (30 January 2012):
...
...
src/cgribexlib.c
View file @
204e735a
/* Automatically generated by m214003 at 2012-0
2-27
, do not edit */
/* Automatically generated by m214003 at 2012-0
4-02
, do not edit */
/* CGRIBEXLIB_VERSION="1.5.2" */
...
...
@@ -12,13 +12,35 @@
#include
<stdarg.h>
#include
<sys/types.h>
#undef _GET_X86_COUNTER
#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
//#undef __AVX__
//#undef __SSE2__
#ifdef _GET_IBM_COUNTER
#include
<libhpc.h>
#endif
#ifdef __AVX__
#include
<float.h>
#include
<stdint.h>
#include
<inttypes.h>
#include
<immintrin.h>
#ifdef _GET_X86_COUNTER
#include
<x86intrin.h>
#endif
#else
#ifdef __SSE2__
#include
<float.h>
#include
<pmmintrin.h>
#include
<stdint.h>
#include
<inttypes.h>
#include
<pmmintrin.h>
#ifdef _GET_X86_COUNTER
#include
<x86intrin.h>
#endif
#endif
#endif
#include
"file.h"
#include
"dmemory.h"
...
...
@@ -2804,12 +2826,116 @@ void gribPrintSec4Wave(int *isec4)
}
}
#undef _GET_X86_COUNTER
#undef _GET_IBM_COUNTER
//#undef _ARCH_PWR6
//#undef __AVX__
//#undef __SSE2__
#ifdef _GET_IBM_COUNTER
#endif
#ifdef __AVX__
#ifdef _GET_X86_COUNTER
#endif
#else
#ifdef __SSE2__
#ifdef _GET_X86_COUNTER
#endif
#endif
#endif
#ifdef __AVX__
static
void
avx_minmaxval
(
const
double
*
restrict
buf
,
long
nframes
,
double
*
min
,
double
*
max
)
{
__m256d
current_max
,
current_min
,
work
;
// load max and min values into all four slots of the XMM registers
current_min
=
_mm256_set1_pd
(
*
min
);
current_max
=
_mm256_set1_pd
(
*
max
);
// Work input until "buf" reaches 32 byte alignment
while
(
((
unsigned
long
)
buf
)
%
32
!=
0
&&
nframes
>
0
)
{
// Load the next double into the work buffer
work
=
_mm256_set1_pd
(
*
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
++
;
nframes
--
;
}
while
(
nframes
>=
16
)
{
// use 64 byte prefetch?
__builtin_prefetch
(
buf
+
64
,
0
,
0
);
// for GCC 4.3.2+
work
=
_mm256_load_pd
(
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
+=
4
;
__builtin_prefetch
(
buf
+
64
,
0
,
0
);
// for GCC 4.3.2+
work
=
_mm256_load_pd
(
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
+=
4
;
__builtin_prefetch
(
buf
+
64
,
0
,
0
);
// for GCC 4.3.2+
work
=
_mm256_load_pd
(
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
+=
4
;
__builtin_prefetch
(
buf
+
64
,
0
,
0
);
// for GCC 4.3.2+
work
=
_mm256_load_pd
(
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
+=
4
;
nframes
-=
16
;
}
// work through aligned buffers
while
(
nframes
>=
4
)
{
work
=
_mm256_load_pd
(
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
+=
4
;
nframes
-=
4
;
}
// work through the remainung values
while
(
nframes
>
0
)
{
work
=
_mm256_set1_pd
(
*
buf
);
current_min
=
_mm256_min_pd
(
current_min
,
work
);
current_max
=
_mm256_max_pd
(
current_max
,
work
);
buf
++
;
nframes
--
;
}
// find min & max value through shuffle tricks
work
=
current_min
;
work
=
_mm256_shuffle_pd
(
work
,
work
,
_MM_SHUFFLE
(
2
,
3
,
0
,
1
));
work
=
_mm256_min_pd
(
work
,
current_min
);
current_min
=
work
;
work
=
_mm256_shuffle_pd
(
work
,
work
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
work
=
_mm256_min_pd
(
work
,
current_min
);
_mm256_store_pd
(
min
,
work
);
work
=
current_max
;
work
=
_mm256_shuffle_pd
(
work
,
work
,
_MM_SHUFFLE
(
2
,
3
,
0
,
1
));
work
=
_mm256_max_pd
(
work
,
current_max
);
current_max
=
work
;
work
=
_mm256_shuffle_pd
(
work
,
work
,
_MM_SHUFFLE
(
1
,
0
,
3
,
2
));
work
=
_mm256_max_pd
(
work
,
current_max
);
_mm256_store_pd
(
max
,
work
);
return
;
}
#else
#ifdef __SSE2__
static
void
sse_minmaxval
(
const
double
*
restrict
buf
,
long
nframes
,
double
*
min
,
double
*
max
)
void
sse
2
_minmaxval
(
const
double
*
restrict
buf
,
long
nframes
,
double
*
min
,
double
*
max
)
{
__m128d
current_max
,
current_min
,
work
;
...
...
@@ -2884,19 +3010,89 @@ void sse_minmaxval(const double *restrict buf, long nframes, double *min, double
}
#endif
#endif
static
void
minmaxval
(
const
double
*
restrict
data
,
long
datasize
,
double
*
fmin
,
double
*
fmax
)
{
long
i
;
#ifdef _GET_X86_COUNTER
uint64_t
start_minmax
,
end_minmax
;
#endif
if
(
datasize
<
1
)
return
;
#ifdef _GET_X86_COUNTER
start_minmax
=
_rdtsc
();
#endif
#ifdef __AVX__
avx_minmaxval
(
data
,
datasize
,
fmin
,
fmax
);
#else
#ifdef __SSE2__
sse_minmaxval
(
data
,
datasize
,
fmin
,
fmax
);
sse
2
_minmaxval
(
data
,
datasize
,
fmin
,
fmax
);
#else
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_1 7
// to allow pipelining we have to unroll
#ifdef _GET_IBM_COUNTER
hpmStart
(
1
,
"minmax fsel"
);
#endif
{
long
i
,
j
;
long
residual
=
datasize
%
__UNROLL_DEPTH_1
;
long
ofs
=
datasize
-
residual
;
double
register
dmin
[
__UNROLL_DEPTH_1
];
double
register
dmax
[
__UNROLL_DEPTH_1
];
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_1
;
j
++
)
{
dmin
[
j
]
=
data
[
0
];
dmax
[
j
]
=
data
[
0
];
}
for
(
i
=
0
;
i
<
datasize
-
residual
;
i
+=
__UNROLL_DEPTH_1
)
{
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_1
;
j
++
)
{
dmin
[
j
]
=
__fsel
(
dmin
[
j
]
-
data
[
i
+
j
],
data
[
i
+
j
],
dmin
[
j
]);
dmax
[
j
]
=
__fsel
(
data
[
i
+
j
]
-
dmax
[
j
],
data
[
i
+
j
],
dmax
[
j
]);
}
}
for
(
j
=
0
;
j
<
residual
;
j
++
)
{
dmin
[
j
]
=
__fsel
(
dmin
[
j
]
-
data
[
ofs
+
j
],
data
[
ofs
+
j
],
dmin
[
j
]);
dmax
[
j
]
=
__fsel
(
data
[
ofs
+
j
]
-
dmax
[
j
],
data
[
ofs
+
j
],
dmax
[
j
]);
}
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_1
;
j
++
)
{
*
fmin
=
__fsel
(
*
fmin
-
dmin
[
j
],
dmin
[
j
],
*
fmin
);
*
fmax
=
__fsel
(
dmax
[
j
]
-
*
fmax
,
dmax
[
j
],
*
fmax
);
}
}
#ifdef _GET_IBM_COUNTER
hpmStop
(
1
);
#endif
#undef __UNROLL_DEPTH_1
#else
#ifdef _GET_IBM_COUNTER
hpmStart
(
1
,
"minmax base"
);
#endif
{
long
i
;
#if defined (CRAY)
#pragma _CRI ivdep
#elif defined (SX)
...
...
@@ -2904,18 +3100,42 @@ void minmaxval(const double *restrict data, long datasize, double *fmin, double
#elif defined (__uxp__)
#pragma loop novrec
#endif
for
(
i
=
0
;
i
<
datasize
;
++
i
)
{
if
(
*
fmin
>
data
[
i
]
)
*
fmin
=
data
[
i
];
if
(
*
fmax
<
data
[
i
]
)
*
fmax
=
data
[
i
];
/*
*fmin = *fmin < data[i] ? *fmin : data[i];
*fmax = *fmax > data[i] ? *fmax : data[i];
*/
}
for
(
i
=
0
;
i
<
datasize
;
++
i
)
{
if
(
*
fmin
>
data
[
i
]
)
*
fmin
=
data
[
i
];
if
(
*
fmax
<
data
[
i
]
)
*
fmax
=
data
[
i
];
/*
*fmin = *fmin < data[i] ? *fmin : data[i];
*fmax = *fmax > data[i] ? *fmax : data[i];
*/
}
}
#ifdef _GET_IBM_COUNTER
hpmStop
(
1
);
#endif
#endif
#endif
#endif
#ifdef _GET_X86_COUNTER
end_minmax
=
_rdtsc
();
#ifdef __AVX__
printf
(
"AVX cycles:: %"
PRIu64
"
\n
"
,
end_minmax
-
start_minmax
);
#else
#ifdef __SSE2__
printf
(
"SSE2 cycles:: %"
PRIu64
"
\n
"
,
end_minmax
-
start_minmax
);
#else
printf
(
"loop cycles:: %"
PRIu64
"
\n
"
,
end_minmax
-
start_minmax
);
#endif
#endif
#endif
}
return
;
}
int
BitsPerInt
=
(
int
)
(
sizeof
(
int
)
*
8
);
...
...
@@ -3513,11 +3733,14 @@ void encode_double_array_common(int numBits, long packStart, long datasize, GRIB
}
static
void
encode_double_array
(
int
numBits
,
long
PackStart
,
long
datasize
,
GRIBPACK
*
lGrib
,
const
double
*
data
,
double
zref
,
double
factor
,
long
*
gz
)
void
encode_double_array
(
int
numBits
,
long
PackStart
,
long
datasize
,
GRIBPACK
*
restrict
lGrib
,
const
double
*
restrict
data
,
double
zref
,
double
factor
,
long
*
restrict
gz
)
{
long
i
,
z
=
*
gz
;
unsigned
int
ipval
;
unsigned
long
ipval
;
double
tmp
;
if
(
numBits
==
8
)
{
...
...
@@ -3530,7 +3753,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for
(
i
=
PackStart
;
i
<
datasize
;
i
++
)
{
ipval
=
(
unsigned
int
)
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
tmp
=
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
ipval
=
(
unsigned
long
)
tmp
;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
...
...
@@ -3547,18 +3771,81 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#pragma vdir nodep
#elif defined (__uxp__)
#pragma loop novrec
#endif
#ifdef _ARCH_PWR6
#define __UNROLL_DEPTH_2 8
{
long
j
;
long
residual
=
datasize
%
__UNROLL_DEPTH_2
;
long
ofs
=
datasize
-
residual
;
double
dval
[
__UNROLL_DEPTH_2
];
unsigned
long
ival
[
__UNROLL_DEPTH_2
];
// reducing FP operations to single FMA is slowing down ...
// double dconst;
// (data - zref)*factor+0.5
// = factor*data - zref*factor + 0.5
// dconst = zref*factor-0.5;
#ifdef _GET_IBM_COUNTER
hpmStart
(
2
,
"pack 16bit unrolled"
);
#endif
#pragma nounrollandfuse
for
(
i
=
0
;
i
<
datasize
-
residual
;
i
+=
__UNROLL_DEPTH_2
)
{
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_2
;
j
++
)
{
dval
[
j
]
=
((
data
[
i
+
j
]
-
zref
)
*
factor
+
0
.
5
);
//dval[j] = factor*data[i+j] - dconst;
}
s
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_2
;
j
++
)
{
ival
[
j
]
=
(
unsigned
long
)
dval
[
j
];
// if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
// if ( ipval < 0 ) ipval = 0;
}
for
(
j
=
0
;
j
<
__UNROLL_DEPTH_2
;
j
++
)
{
lGrib
[
z
]
=
ival
[
j
]
>>
8
;
lGrib
[
z
+
1
]
=
ival
[
j
];
z
+=
2
;
}
}
for
(
j
=
0
;
j
<
residual
;
j
++
)
{
dval
[
j
]
=
((
data
[
ofs
+
j
]
-
zref
)
*
factor
+
0
.
5
);
//dval[j] = factor*data[i+j] - dconst;
ival
[
j
]
=
(
unsigned
long
)
dval
[
j
];
lGrib
[
z
]
=
ival
[
j
]
>>
8
;
lGrib
[
z
+
1
]
=
ival
[
j
];
z
+=
2
;
}
#ifdef _GET_IBM_COUNTER
hpmStop
(
2
);
#endif
#undef __UNROLL_DEPTH_2
}
#else
#ifdef _GET_IBM_COUNTER
hpmStart
(
2
,
"pack 16bit base"
);
#endif
for
(
i
=
PackStart
;
i
<
datasize
;
i
++
)
{
ipval
=
(
unsigned
int
)
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
*/
lGrib
[
z
]
=
ipval
>>
8
;
{
tmp
=
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
ipval
=
(
unsigned
long
)
tmp
;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
*/
lGrib
[
z
]
=
ipval
>>
8
;
lGrib
[
z
+
1
]
=
ipval
;
z
+=
2
;
}
z
+=
2
;
}
#ifdef _GET_IBM_COUNTER
hpmStop
(
2
);
#endif
#endif
}
else
if
(
numBits
==
24
)
{
...
...
@@ -3571,7 +3858,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for
(
i
=
PackStart
;
i
<
datasize
;
i
++
)
{
ipval
=
(
unsigned
int
)
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
tmp
=
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
ipval
=
(
unsigned
long
)
tmp
;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
...
...
@@ -3593,7 +3881,8 @@ void encode_double_array(int numBits, long PackStart, long datasize, GRIBPACK *l
#endif
for
(
i
=
PackStart
;
i
<
datasize
;
i
++
)
{
ipval
=
(
unsigned
int
)
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
tmp
=
((
data
[
i
]
-
zref
)
*
factor
+
0
.
5
);
ipval
=
(
unsigned
long
)
tmp
;
/*
if ( ipval > max_nbpv_pow2 ) ipval = max_nbpv_pow2;
if ( ipval < 0 ) ipval = 0;
...
...
@@ -9270,7 +9559,7 @@ int gribUnzip(unsigned char *dbuf, long dbufsize, unsigned char *sbuf, long sbu
return
(
gribLen
);
}
static
const
char
grb_libvers
[]
=
"1.5.2"
" of ""
Feb
2
7
2012"" ""1
3:52:12
"
;
static
const
char
grb_libvers
[]
=
"1.5.2"
" of ""
Apr
2 2012"" ""1
0:03:59
"
;
const
char
*
cgribexLibraryVersion
(
void
)
{
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment