Skip to content

Commit 1b212bd

Browse files
authored
ENH: Use AVX-512 for np.frexp and np.ldexp (numpy#16371)
* TST: Adding tests to validate strided np.ldexp and np.frexp * ENH: Use AVX-512 for float and double np.ldexp
1 parent 55ecf64 commit 1b212bd

File tree

6 files changed

+272
-8
lines changed

6 files changed

+272
-8
lines changed

benchmarks/benchmarks/bench_avx.py

+17
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
'floor',
1515
'ceil' ,
1616
'trunc',
17+
'frexp',
1718
'isnan',
1819
'isfinite',
1920
'isinf',
@@ -60,6 +61,22 @@ def setup(self, ufuncname, dtype, stride):
6061
def time_ufunc(self, ufuncname, dtype, stride):
6162
self.f(self.arr1[::stride], self.arr2[::stride])
6263

64+
class AVX_ldexp(Benchmark):
65+
66+
params = [dtype, stride]
67+
param_names = ['dtype', 'stride']
68+
timeout = 10
69+
70+
def setup(self, dtype, stride):
71+
np.seterr(all='ignore')
72+
self.f = getattr(np, 'ldexp')
73+
N = 10000
74+
self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
75+
self.arr2 = np.array(np.random.rand(stride*N), dtype='i')
76+
77+
def time_ufunc(self, dtype, stride):
78+
self.f(self.arr1[::stride], self.arr2[::stride])
79+
6380
cmplx_bfuncs = ['add',
6481
'subtract',
6582
'multiply',

numpy/core/code_generators/generate_umath.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -898,10 +898,10 @@ def english_upper(s):
898898
docstrings.get('numpy.core.umath.ldexp'),
899899
None,
900900
[TypeDescription('e', None, 'ei', 'e'),
901-
TypeDescription('f', None, 'fi', 'f'),
901+
TypeDescription('f', None, 'fi', 'f', simd=['avx512_skx']),
902902
TypeDescription('e', FuncNameSuffix('long'), 'el', 'e'),
903903
TypeDescription('f', FuncNameSuffix('long'), 'fl', 'f'),
904-
TypeDescription('d', None, 'di', 'd'),
904+
TypeDescription('d', None, 'di', 'd', simd=['avx512_skx']),
905905
TypeDescription('d', FuncNameSuffix('long'), 'dl', 'd'),
906906
TypeDescription('g', None, 'gi', 'g'),
907907
TypeDescription('g', FuncNameSuffix('long'), 'gl', 'g'),
@@ -912,8 +912,8 @@ def english_upper(s):
912912
docstrings.get('numpy.core.umath.frexp'),
913913
None,
914914
[TypeDescription('e', None, 'e', 'ei'),
915-
TypeDescription('f', None, 'f', 'fi'),
916-
TypeDescription('d', None, 'd', 'di'),
915+
TypeDescription('f', None, 'f', 'fi', simd=['avx512_skx']),
916+
TypeDescription('d', None, 'd', 'di', simd=['avx512_skx']),
917917
TypeDescription('g', None, 'g', 'gi'),
918918
],
919919
),

numpy/core/src/umath/loops.c.src

+16
Original file line numberDiff line numberDiff line change
@@ -2136,6 +2136,14 @@ NPY_NO_EXPORT void
21362136
}
21372137
}
21382138

2139+
NPY_NO_EXPORT void
2140+
@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
2141+
{
2142+
if (!run_unary_two_out_avx512_skx_frexp_@TYPE@(args, dimensions, steps)) {
2143+
@TYPE@_frexp(args, dimensions, steps, func);
2144+
}
2145+
}
2146+
21392147
NPY_NO_EXPORT void
21402148
@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
21412149
{
@@ -2146,6 +2154,14 @@ NPY_NO_EXPORT void
21462154
}
21472155
}
21482156

2157+
NPY_NO_EXPORT void
2158+
@TYPE@_ldexp_avx512_skx(char **args, const npy_intp *dimensions, const npy_intp *steps, void *func)
2159+
{
2160+
if (!run_binary_avx512_skx_ldexp_@TYPE@(args, dimensions, steps)) {
2161+
@TYPE@_ldexp(args, dimensions, steps, func);
2162+
}
2163+
}
2164+
21492165
NPY_NO_EXPORT void
21502166
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
21512167
{

numpy/core/src/umath/loops.h.src

+6
Original file line numberDiff line numberDiff line change
@@ -338,9 +338,15 @@ NPY_NO_EXPORT void
338338
NPY_NO_EXPORT void
339339
@TYPE@_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
340340

341+
NPY_NO_EXPORT void
342+
@TYPE@_frexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
343+
341344
NPY_NO_EXPORT void
342345
@TYPE@_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
343346

347+
NPY_NO_EXPORT void
348+
@TYPE@_ldexp_avx512_skx(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
349+
344350
NPY_NO_EXPORT void
345351
@TYPE@_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
346352

numpy/core/src/umath/simd.inc.src

+201-4
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,13 @@ nomemoverlap(char *ip,
120120
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
121121
(nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))
122122

123+
#define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
124+
((abs(steps[0]) < MAX_STEP_SIZE) && \
125+
(abs(steps[1]) < MAX_STEP_SIZE) && \
126+
(abs(steps[2]) < MAX_STEP_SIZE) && \
127+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
128+
(nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
129+
123130
/*
124131
* 1) Output should be contiguous, can handle strided input data
125132
* 2) Input step should be smaller than MAX_STEP_SIZE for performance
@@ -294,6 +301,42 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
294301

295302

296303
/**end repeat1**/
304+
305+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
306+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
307+
AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
308+
309+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
310+
AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
311+
#endif
312+
313+
static NPY_INLINE int
314+
run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
315+
{
316+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
317+
if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
318+
AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
319+
return 1;
320+
}
321+
else
322+
return 0;
323+
#endif
324+
return 0;
325+
}
326+
327+
static NPY_INLINE int
328+
run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
329+
{
330+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
331+
if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
332+
AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
333+
return 1;
334+
}
335+
else
336+
return 0;
337+
#endif
338+
return 0;
339+
}
297340
/**end repeat**/
298341

299342
/**begin repeat
@@ -2089,13 +2132,167 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
20892132
* #num_lanes = 16, 8#
20902133
* #vsuffix = ps, pd#
20912134
* #mask = __mmask16, __mmask8#
2092-
* #vtype = __m512, __m512d#
2135+
* #vtype1 = __m512, __m512d#
2136+
* #vtype2 = __m512i, __m256i#
20932137
* #scale = 4, 8#
20942138
* #vindextype = __m512i, __m256i#
20952139
* #vindexsize = 512, 256#
20962140
* #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
2141+
* #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
2142+
* #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
2143+
* #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
2144+
* #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
2145+
* #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
20972146
*/
20982147

2148+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
2149+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2150+
AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
2151+
{
2152+
const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
2153+
const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
2154+
const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
2155+
const npy_intp array_size = dimensions[0];
2156+
npy_intp num_remaining_elements = array_size;
2157+
@type@* ip1 = (@type@*) args[0];
2158+
int* ip2 = (int*) args[1];
2159+
@type@* op = (@type@*) args[2];
2160+
2161+
@mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2162+
2163+
/*
2164+
* Note: while generally indices are npy_intp, we ensure that our maximum index
2165+
* will fit in an int32 as a precondition for this function via
2166+
* IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
2167+
*/
2168+
2169+
npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
2170+
for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2171+
index_ip1[ii] = ii*stride_ip1;
2172+
index_ip2[ii] = ii*stride_ip2;
2173+
index_op[ii] = ii*stride_op;
2174+
}
2175+
@vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
2176+
@vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
2177+
@vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
2178+
@vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
2179+
@vtype2@ zeros = @setzero@();
2180+
2181+
while (num_remaining_elements > 0) {
2182+
if (num_remaining_elements < @num_lanes@) {
2183+
load_mask = avx512_get_partial_load_mask_@vsuffix@(
2184+
num_remaining_elements, @num_lanes@);
2185+
}
2186+
@vtype1@ x1;
2187+
@vtype2@ x2;
2188+
if (stride_ip1 == 1) {
2189+
x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
2190+
}
2191+
else {
2192+
x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
2193+
}
2194+
if (stride_ip2 == 1) {
2195+
x2 = @vtype2_load@(load_mask, ip2);
2196+
}
2197+
else {
2198+
x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4);
2199+
}
2200+
2201+
@vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
2202+
2203+
if (stride_op == 1) {
2204+
_mm512_mask_storeu_@vsuffix@(op, load_mask, out);
2205+
}
2206+
else {
2207+
/* scatter! */
2208+
_mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
2209+
}
2210+
2211+
ip1 += @num_lanes@*stride_ip1;
2212+
ip2 += @num_lanes@*stride_ip2;
2213+
op += @num_lanes@*stride_op;
2214+
num_remaining_elements -= @num_lanes@;
2215+
}
2216+
}
2217+
2218+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2219+
AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
2220+
{
2221+
const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
2222+
const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@);
2223+
const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
2224+
const npy_intp array_size = dimensions[0];
2225+
npy_intp num_remaining_elements = array_size;
2226+
@type@* ip1 = (@type@*) args[0];
2227+
@type@* op1 = (@type@*) args[1];
2228+
int* op2 = (int*) args[2];
2229+
2230+
@mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2231+
2232+
/*
2233+
* Note: while generally indices are npy_intp, we ensure that our maximum index
2234+
* will fit in an int32 as a precondition for this function via
2235+
* IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
2236+
*/
2237+
2238+
npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@];
2239+
for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2240+
index_ip1[ii] = ii*stride_ip1;
2241+
index_op1[ii] = ii*stride_op1;
2242+
index_op2[ii] = ii*stride_op2;
2243+
}
2244+
@vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
2245+
@vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]);
2246+
@vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]);
2247+
@vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
2248+
2249+
while (num_remaining_elements > 0) {
2250+
if (num_remaining_elements < @num_lanes@) {
2251+
load_mask = avx512_get_partial_load_mask_@vsuffix@(
2252+
num_remaining_elements, @num_lanes@);
2253+
}
2254+
@vtype1@ x1;
2255+
if (stride_ip1 == 1) {
2256+
x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
2257+
}
2258+
else {
2259+
x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
2260+
}
2261+
2262+
/*
2263+
* The x86 instructions vpgetmant and vpgetexp do not conform
2264+
* with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
2265+
* We mask these values with spmask to avoid invalid exceptions.
2266+
*/
2267+
@mask@ spmask =_mm512_knot(_mm512_fpclass_@vsuffix@_mask(
2268+
x1, 0b10011111));
2269+
@vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@(
2270+
spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
2271+
out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1);
2272+
@vtype2@ out2 = _mm512_cvt@vsuffix@_epi32(
2273+
_mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0),
2274+
_mm512_maskz_getexp_@vsuffix@(spmask, x1)));
2275+
if (stride_op1 == 1) {
2276+
_mm512_mask_storeu_@vsuffix@(op1, load_mask, out1);
2277+
}
2278+
else {
2279+
_mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@);
2280+
}
2281+
if (stride_op2 == 1) {
2282+
@vtype2_store@(op2, load_mask, out2);
2283+
}
2284+
else {
2285+
@vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4);
2286+
}
2287+
2288+
ip1 += @num_lanes@*stride_ip1;
2289+
op1 += @num_lanes@*stride_op1;
2290+
op2 += @num_lanes@*stride_op2;
2291+
num_remaining_elements -= @num_lanes@;
2292+
}
2293+
}
2294+
#endif
2295+
20992296
/**begin repeat1
21002297
* #func = maximum, minimum#
21012298
* #vectorf = max, min#
@@ -2131,14 +2328,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
21312328
@vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
21322329
@vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
21332330
@vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
2134-
@vtype@ zeros_f = _mm512_setzero_@vsuffix@();
2331+
@vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
21352332

21362333
while (num_remaining_elements > 0) {
21372334
if (num_remaining_elements < @num_lanes@) {
21382335
load_mask = avx512_get_partial_load_mask_@vsuffix@(
21392336
num_remaining_elements, @num_lanes@);
21402337
}
2141-
@vtype@ x1, x2;
2338+
@vtype1@ x1, x2;
21422339
if (stride_ip1 == 1) {
21432340
x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
21442341
}
@@ -2158,7 +2355,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
21582355
* this issue to conform with NumPy behaviour.
21592356
*/
21602357
@mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
2161-
@vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
2358+
@vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
21622359
out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
21632360

21642361
if (stride_op == 1) {

numpy/core/tests/test_umath.py

+28
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import fnmatch
44
import itertools
55
import pytest
6+
import sys
67
from fractions import Fraction
78

89
import numpy.core.umath as ncu
@@ -789,6 +790,33 @@ def test_fpclass(self, stride):
789790
assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
790791
assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
791792

793+
class TestLDExp:
794+
@pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
795+
@pytest.mark.parametrize("dtype", ['f', 'd'])
796+
def test_ldexp(self, dtype, stride):
797+
mant = np.array([0.125, 0.25, 0.5, 1., 1., 2., 4., 8.], dtype=dtype)
798+
exp = np.array([3, 2, 1, 0, 0, -1, -2, -3], dtype='i')
799+
out = np.zeros(8, dtype=dtype)
800+
assert_equal(np.ldexp(mant[::stride], exp[::stride], out=out[::stride]), np.ones(8, dtype=dtype)[::stride])
801+
assert_equal(out[::stride], np.ones(8, dtype=dtype)[::stride])
802+
803+
class TestFRExp:
804+
@pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
805+
@pytest.mark.parametrize("dtype", ['f', 'd'])
806+
@pytest.mark.skipif(not sys.platform.startswith('linux'),
807+
reason="np.frexp gives different answers for NAN/INF on windows and linux")
808+
def test_frexp(self, dtype, stride):
809+
arr = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 1.0, -1.0], dtype=dtype)
810+
mant_true = np.array([np.nan, np.nan, np.inf, -np.inf, 0.0, -0.0, 0.5, -0.5], dtype=dtype)
811+
exp_true = np.array([0, 0, 0, 0, 0, 0, 1, 1], dtype='i')
812+
out_mant = np.ones(8, dtype=dtype)
813+
out_exp = 2*np.ones(8, dtype='i')
814+
mant, exp = np.frexp(arr[::stride], out=(out_mant[::stride], out_exp[::stride]))
815+
assert_equal(mant_true[::stride], mant)
816+
assert_equal(exp_true[::stride], exp)
817+
assert_equal(out_mant[::stride], mant_true[::stride])
818+
assert_equal(out_exp[::stride], exp_true[::stride])
819+
792820
# func : [maxulperror, low, high]
793821
avx_ufuncs = {'sqrt' :[1, 0., 100.],
794822
'absolute' :[0, -100., 100.],

0 commit comments

Comments
 (0)