@@ -1,4 +1,4 @@
-/* -*- c -*- */
+
 
 /*
  * This file is for the definitions of simd vectorized operations.
|
@@ -293,6 +293,40 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
 }
 
 
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #EXISTS = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = isnan, isfinite, isinf, signbit#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_@func@_@TYPE@(npy_bool*, @type@*, const npy_intp n, const npy_intp stride);
+#endif
+
+static NPY_INLINE int
+run_@func@_avx512_skx_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_bool), 64)) {
+        AVX512_SKX_@func@_@TYPE@((npy_bool*)args[1], (@type@*)args[0], dimensions[0], steps[0]);
+        return 1;
+    }
+    else {
+        return 0;
+    }
+#endif
+    return 0;
+}
+
+
 /**end repeat1**/
 /**end repeat**/
 
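The run_@func@_avx512_skx_@TYPE@ wrapper added above follows the same dispatch convention as the other run_* helpers in this file: it returns 1 when the AVX512-SKX kernel handled the whole loop (only attempted when IS_OUTPUT_BLOCKABLE_UNARY accepts the output layout) and 0 when the caller should fall back to its scalar loop. Below is a minimal sketch of a caller, assuming the FLOAT/isnan instantiation; the FLOAT_isnan_loop name and the scalar fallback are illustrative, not the actual generated NumPy inner loop.

    /* Illustrative caller only; not the generated loops.c.src code. */
    static void
    FLOAT_isnan_loop(char **args, npy_intp const *dimensions, npy_intp const *steps)
    {
        if (run_isnan_avx512_skx_FLOAT(args, dimensions, steps)) {
            return;  /* SIMD kernel already wrote every output element */
        }
        /* Scalar fallback: walk input and output by their byte strides. */
        char *ip = args[0], *op = args[1];
        for (npy_intp i = 0; i < dimensions[0]; i++, ip += steps[0], op += steps[1]) {
            *(npy_bool *)op = npy_isnan(*(npy_float *)ip) != 0;
        }
    }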
|
@@ -1971,6 +2005,84 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #episize = epi32, epi64#
+ */
+
+/**begin repeat1
+ * #func = isnan, isfinite, isinf, signbit#
+ * #IMM8 = 0x81, 0x99, 0x18, 0x04#
+ * #is_finite = 0, 1, 0, 0#
+ * #is_signbit = 0, 0, 0, 1#
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
+{
+    const npy_intp stride_ip = steps/(npy_intp)sizeof(@type@);
+    npy_intp num_remaining_elements = array_size;
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+#if @is_signbit@
+    @vtype@ signbit = _mm512_set1_@vsuffix@(-0.0);
+#endif
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum
+     * index will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+
+    npy_int32 index_ip[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip[ii] = ii*stride_ip;
+    }
+    @vindextype@ vindex_ip = @vindexload@((@vindextype@*)&index_ip[0]);
+    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+    __m512i ones = _mm512_set1_@episize@(1);
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1;
+        if (stride_ip == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip, vindex_ip, load_mask);
+        }
+#if @is_signbit@
+        x1 = _mm512_and_@vsuffix@(x1, signbit);
+#endif
+
+        @mask@ fpclassmask = _mm512_fpclass_@vsuffix@_mask(x1, @IMM8@);
+#if @is_finite@
+        fpclassmask = _mm512_knot(fpclassmask);
+#endif
+
+        __m128i out = _mm512_maskz_cvts@episize@_epi8(fpclassmask, ones);
+        _mm_mask_storeu_epi8(op, load_mask, out);
+
+        ip += @num_lanes@*stride_ip;
+        op += @num_lanes@;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
+
 /**begin repeat
  * #type = npy_float, npy_double#
  * #TYPE = FLOAT, DOUBLE#
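In the kernel above, the per-function #IMM8# constants select category bits for _mm512_fpclass_@vsuffix@_mask (bit 0 QNaN, bit 1 +0.0, bit 2 -0.0, bit 3 +Inf, bit 4 -Inf, bit 5 denormal, bit 6 finite negative, bit 7 SNaN): 0x81 matches any NaN, 0x99 matches NaN or Inf and is then inverted with _mm512_knot to give isfinite, 0x18 matches either infinity, and for signbit the input is first ANDed with -0.0 so every value with the sign bit set collapses to -0.0, which bit 2 (0x04) detects. Below is a standalone sketch of the same idea for 16 floats; it is not taken from the NumPy sources and assumes an AVX512DQ-capable compiler and CPU.

    #include <immintrin.h>

    /* Each returned mask bit i describes p[i]; requires AVX512DQ (e.g. -mavx512dq). */
    static __mmask16 nan_mask(const float *p)
    {
        return _mm512_fpclass_ps_mask(_mm512_loadu_ps(p), 0x81);   /* QNaN | SNaN */
    }

    static __mmask16 finite_mask(const float *p)
    {
        /* 0x99 = QNaN | SNaN | +Inf | -Inf; knot turns "special" into "finite". */
        return _mm512_knot(_mm512_fpclass_ps_mask(_mm512_loadu_ps(p), 0x99));
    }

    static __mmask16 signbit_mask(const float *p)
    {
        /* Keep only the sign bit, then test for -0.0 (category bit 2 = 0x04). */
        __m512 v = _mm512_and_ps(_mm512_loadu_ps(p), _mm512_set1_ps(-0.0f));
        return _mm512_fpclass_ps_mask(v, 0x04);
    }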
|
@@ -2064,8 +2176,8 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
     }
 }
 #endif
-/**end repeat**/
 /**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
  * #ISA = FMA, AVX512F#
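A final note on the loop structure of the AVX512_SKX kernel above: instead of peeling a scalar tail, it shrinks the load/store mask once fewer than @num_lanes@ elements remain, and strided inputs go through a masked gather driven by the precomputed vindex_ip vector. The sketch below shows that tail-masking pattern for contiguous floats using plain intrinsics only; the NumPy-internal helpers from the diff (avx512_get_partial_load_mask_@vsuffix@ and friends) are not reproduced here, so treat this as an assumption-laden illustration rather than the library's code.

    #include <immintrin.h>

    /* Count NaNs in n contiguous floats, masking off lanes past the end.
     * Requires AVX512DQ for _mm512_mask_fpclass_ps_mask. */
    static long count_nan(const float *p, long n)
    {
        long total = 0;
        while (n > 0) {
            /* Full 16-lane mask, or only the first n lanes for the ragged tail. */
            __mmask16 m = (n >= 16) ? (__mmask16)0xFFFF
                                    : (__mmask16)((1u << n) - 1);
            __m512 v = _mm512_maskz_loadu_ps(m, p);   /* masked-off lanes become 0.0 */
            __mmask16 nanbits = _mm512_mask_fpclass_ps_mask(m, v, 0x81);
            total += _mm_popcnt_u32((unsigned)nanbits);
            p += 16;
            n -= 16;
        }
        return total;
    }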
|
|