Skip to content

Commit 8b901c7

Browse files
authored
ENH: Use AVX-512 for np.isnan, np.infinite, np.isinf and np.signbit (numpy#16334)
* ENH: Use AVX-512 for np.isnan, np.infinite, np.isinf and np.signbit * TST: Add tests to validate isnan, isfinite, signbit and isinf ufuncs * BENCH: Adding benchmarks for isnan, isinf, isfinite and signbit
1 parent 7bb4697 commit 8b901c7

File tree

8 files changed

+173
-10
lines changed

8 files changed

+173
-10
lines changed

benchmarks/benchmarks/bench_avx.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,11 @@
1313
'rint',
1414
'floor',
1515
'ceil' ,
16-
'trunc']
16+
'trunc',
17+
'isnan',
18+
'isfinite',
19+
'isinf',
20+
'signbit']
1721
stride = [1, 2, 4]
1822
dtype = ['f', 'd']
1923

numpy/core/code_generators/generate_umath.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -843,7 +843,7 @@ def english_upper(s):
843843
Ufunc(1, 1, None,
844844
docstrings.get('numpy.core.umath.isnan'),
845845
'PyUFunc_IsFiniteTypeResolver',
846-
TD(noobj, out='?'),
846+
TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
847847
),
848848
'isnat':
849849
Ufunc(1, 1, None,
@@ -855,19 +855,19 @@ def english_upper(s):
855855
Ufunc(1, 1, None,
856856
docstrings.get('numpy.core.umath.isinf'),
857857
'PyUFunc_IsFiniteTypeResolver',
858-
TD(noobj, out='?'),
858+
TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
859859
),
860860
'isfinite':
861861
Ufunc(1, 1, None,
862862
docstrings.get('numpy.core.umath.isfinite'),
863863
'PyUFunc_IsFiniteTypeResolver',
864-
TD(noobj, out='?'),
864+
TD(noobj, simd=[('avx512_skx', 'fd')], out='?'),
865865
),
866866
'signbit':
867867
Ufunc(1, 1, None,
868868
docstrings.get('numpy.core.umath.signbit'),
869869
None,
870-
TD(flts, out='?'),
870+
TD(flts, simd=[('avx512_skx', 'fd')], out='?'),
871871
),
872872
'copysign':
873873
Ufunc(2, 1, None,

numpy/core/include/numpy/npy_common.h

+7
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,13 @@
6464
#define NPY_GCC_TARGET_AVX512F
6565
#endif
6666

67+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX && defined HAVE_LINK_AVX512_SKX
68+
#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
69+
#elif defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS
70+
#define NPY_GCC_TARGET_AVX512_SKX __attribute__((target("avx512f,avx512dq,avx512vl,avx512bw,avx512cd")))
71+
#else
72+
#define NPY_GCC_TARGET_AVX512_SKX
73+
#endif
6774
/*
6875
* mark an argument (starting from 1) that must not be NULL and is not checked
6976
* DO NOT USE IF FUNCTION CHECKS FOR NULL!! the compiler will remove the check

numpy/core/setup_common.py

+11
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ def check_api_version(apiversion, codegen_dir):
147147
"stdio.h", "LINK_AVX2"),
148148
("__asm__ volatile", '"vpaddd %zmm1, %zmm2, %zmm3"',
149149
"stdio.h", "LINK_AVX512F"),
150+
("__asm__ volatile", '"vfpclasspd $0x40, %zmm15, %k6\\n"\
151+
"vmovdqu8 %xmm0, %xmm1\\n"\
152+
"vpbroadcastmb2q %k0, %xmm0\\n"',
153+
"stdio.h", "LINK_AVX512_SKX"),
150154
("__asm__ volatile", '"xgetbv"', "stdio.h", "XGETBV"),
151155
]
152156

@@ -165,6 +169,8 @@ def check_api_version(apiversion, codegen_dir):
165169
'attribute_target_avx2'),
166170
('__attribute__((target ("avx512f")))',
167171
'attribute_target_avx512f'),
172+
('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
173+
'attribute_target_avx512_skx'),
168174
]
169175

170176
# function attributes with intrinsics
@@ -181,6 +187,11 @@ def check_api_version(apiversion, codegen_dir):
181187
'attribute_target_avx512f_with_intrinsics',
182188
'__m512 temp = _mm512_set1_ps(1.0)',
183189
'immintrin.h'),
190+
('__attribute__((target ("avx512f,avx512dq,avx512bw,avx512vl,avx512cd")))',
191+
'attribute_target_avx512_skx_with_intrinsics',
192+
'__mmask8 temp = _mm512_fpclass_pd_mask(_mm512_set1_pd(1.0), 0x01);\
193+
_mm_mask_storeu_epi8(NULL, 0xFF, _mm_broadcastmb_epi64(temp))',
194+
'immintrin.h'),
184195
]
185196

186197
# variable attributes tested via "int %s a" % attribute

numpy/core/src/umath/loops.c.src

+8-2
Original file line numberDiff line numberDiff line change
@@ -1863,17 +1863,23 @@ NPY_NO_EXPORT void
18631863
* #kind = isnan, isinf, isfinite, signbit#
18641864
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit#
18651865
**/
1866+
1867+
/**begin repeat2
1868+
* #ISA = , _avx512_skx#
1869+
* #isa = simd, avx512_skx#
1870+
**/
18661871
NPY_NO_EXPORT void
1867-
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
1872+
@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
18681873
{
1869-
if (!run_@kind@_simd_@TYPE@(args, dimensions, steps)) {
1874+
if (!run_@kind@_@isa@_@TYPE@(args, dimensions, steps)) {
18701875
UNARY_LOOP {
18711876
const @type@ in1 = *(@type@ *)ip1;
18721877
*((npy_bool *)op1) = @func@(in1) != 0;
18731878
}
18741879
}
18751880
npy_clear_floatstatus_barrier((char*)dimensions);
18761881
}
1882+
/**end repeat2**/
18771883
/**end repeat1**/
18781884

18791885
NPY_NO_EXPORT void

numpy/core/src/umath/loops.h.src

+6-1
Original file line numberDiff line numberDiff line change
@@ -274,8 +274,13 @@ NPY_NO_EXPORT void
274274
* #kind = isnan, isinf, isfinite, signbit, copysign, nextafter, spacing#
275275
* #func = npy_isnan, npy_isinf, npy_isfinite, npy_signbit, npy_copysign, nextafter, spacing#
276276
**/
277+
278+
/**begin repeat2
279+
* #ISA = , _avx512_skx#
280+
**/
277281
NPY_NO_EXPORT void
278-
@TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
282+
@TYPE@_@kind@@ISA@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
283+
/**end repeat2**/
279284
/**end repeat1**/
280285

281286
/**begin repeat1

numpy/core/src/umath/simd.inc.src

+114-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* -*- c -*- */
1+
22

33
/*
44
* This file is for the definitions of simd vectorized operations.
@@ -293,6 +293,40 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
293293
}
294294

295295

296+
/**end repeat1**/
297+
/**end repeat**/
298+
299+
/**begin repeat
300+
* #type = npy_float, npy_double, npy_longdouble#
301+
* #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
302+
* #EXISTS = 1, 1, 0#
303+
*/
304+
305+
/**begin repeat1
306+
* #func = isnan, isfinite, isinf, signbit#
307+
*/
308+
309+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
310+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
311+
AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
312+
#endif
313+
314+
static NPY_INLINE int
315+
run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
316+
{
317+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
318+
if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_bool), 64)) {
319+
AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
320+
return 1;
321+
}
322+
else {
323+
return 0;
324+
}
325+
#endif
326+
return 0;
327+
}
328+
329+
296330
/**end repeat1**/
297331
/**end repeat**/
298332

@@ -1971,6 +2005,84 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
19712005
#endif
19722006
/**end repeat**/
19732007

2008+
/**begin repeat
2009+
* #type = npy_float, npy_double#
2010+
* #TYPE = FLOAT, DOUBLE#
2011+
* #num_lanes = 16, 8#
2012+
* #vsuffix = ps, pd#
2013+
* #mask = __mmask16, __mmask8#
2014+
* #vtype = __m512, __m512d#
2015+
* #scale = 4, 8#
2016+
* #vindextype = __m512i, __m256i#
2017+
* #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
2018+
* #episize = epi32, epi64#
2019+
*/
2020+
2021+
/**begin repeat1
2022+
* #func = isnan, isfinite, isinf, signbit#
2023+
* #IMM8 = 0x81, 0x99, 0x18, 0x04#
2024+
* #is_finite = 0, 1, 0, 0#
2025+
* #is_signbit = 0, 0, 0, 1#
2026+
*/
2027+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
2028+
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
2029+
AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
2030+
{
2031+
const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
2032+
npy_intp num_remaining_elements = array_size;
2033+
2034+
@mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
2035+
#if @is_signbit@
2036+
@vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
2037+
#endif
2038+
2039+
/*
2040+
* Note: while generally indices are npy_intp, we ensure that our maximum
2041+
* index will fit in an int32 as a precondition for this function via
2042+
* IS_OUTPUT_BLOCKABLE_UNARY
2043+
*/
2044+
2045+
npy_int32 index_ip[@num_lanes@];
2046+
for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
2047+
index_ip[ii] = ii*stride_ip;
2048+
}
2049+
@vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
2050+
@vtype@ zeros_f = _mm512_setzero_@vsuffix@();
2051+
__m512i ones = _mm512_set1_@episize@(1);
2052+
2053+
while (num_remaining_elements > 0) {
2054+
if (num_remaining_elements < @num_lanes@) {
2055+
load_mask = avx512_get_partial_load_mask_@vsuffix@(
2056+
num_remaining_elements, @num_lanes@);
2057+
}
2058+
@vtype@ x1;
2059+
if (stride_ip == 1) {
2060+
x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
2061+
}
2062+
else {
2063+
x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
2064+
}
2065+
#if @is_signbit@
2066+
x1 = _mm512_and_@vsuffix@(x1,signbit);
2067+
#endif
2068+
2069+
@mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
2070+
#if @is_finite@
2071+
fpclassmask = _mm512_knot(fpclassmask);
2072+
#endif
2073+
2074+
__m128i out =_mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
2075+
_mm_mask_storeu_epi8(op, load_mask, out);
2076+
2077+
ip += @num_lanes@*stride_ip;
2078+
op += @num_lanes@;
2079+
num_remaining_elements -= @num_lanes@;
2080+
}
2081+
}
2082+
#endif
2083+
/**end repeat1**/
2084+
/**end repeat**/
2085+
19742086
/**begin repeat
19752087
* #type = npy_float, npy_double#
19762088
* #TYPE = FLOAT, DOUBLE#
@@ -2064,8 +2176,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
20642176
}
20652177
}
20662178
#endif
2067-
/**end repeat**/
20682179
/**end repeat1**/
2180+
/**end repeat**/
20692181

20702182
/**begin repeat
20712183
* #ISA = FMA, AVX512F#

numpy/core/tests/test_umath.py

+18
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,24 @@ def test_reciprocal_values(self):
771771
for dt in ['f', 'd', 'g']:
772772
assert_raises(FloatingPointError, np.reciprocal, np.array(-0.0, dtype=dt))
773773

774+
class TestFPClass:
775+
@pytest.mark.parametrize("stride", [-4,-2,-1,1,2,4])
776+
def test_fpclass(self, stride):
777+
arr_f64 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 2.2251e-308, -2.2251e-308], dtype='d')
778+
arr_f32 = np.array([np.nan, -np.nan, np.inf, -np.inf, -1.0, 1.0, -0.0, 0.0, 1.4013e-045, -1.4013e-045], dtype='f')
779+
nan = np.array([True, True, False, False, False, False, False, False, False, False])
780+
inf = np.array([False, False, True, True, False, False, False, False, False, False])
781+
sign = np.array([False, True, False, True, True, False, True, False, False, True])
782+
finite = np.array([False, False, False, False, True, True, True, True, True, True])
783+
assert_equal(np.isnan(arr_f32[::stride]), nan[::stride])
784+
assert_equal(np.isnan(arr_f64[::stride]), nan[::stride])
785+
assert_equal(np.isinf(arr_f32[::stride]), inf[::stride])
786+
assert_equal(np.isinf(arr_f64[::stride]), inf[::stride])
787+
assert_equal(np.signbit(arr_f32[::stride]), sign[::stride])
788+
assert_equal(np.signbit(arr_f64[::stride]), sign[::stride])
789+
assert_equal(np.isfinite(arr_f32[::stride]), finite[::stride])
790+
assert_equal(np.isfinite(arr_f64[::stride]), finite[::stride])
791+
774792
# func : [maxulperror, low, high]
775793
avx_ufuncs = {'sqrt' :[1, 0., 100.],
776794
'absolute' :[0, -100., 100.],

0 commit comments

Comments
 (0)