@@ -120,6 +120,13 @@ nomemoverlap(char *ip,
      (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
      (nomemoverlap(args[1], steps[1] * dimensions[0], args[2], steps[2] * dimensions[0])))

+ #define IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP \
+     ((abs(steps[0]) < MAX_STEP_SIZE) && \
+      (abs(steps[1]) < MAX_STEP_SIZE) && \
+      (abs(steps[2]) < MAX_STEP_SIZE) && \
+      (nomemoverlap(args[0], steps[0] * dimensions[0], args[2], steps[2] * dimensions[0])) && \
+      (nomemoverlap(args[0], steps[0] * dimensions[0], args[1], steps[1] * dimensions[0])))
+
 /*
  * 1) Output should be contiguous, can handle strided input data
  * 2) Input step should be smaller than MAX_STEP_SIZE for performance
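
These NOMEMOVERLAP macros guard the gather/scatter kernels below: small steps keep every per-lane index (ii * stride) within int32 range, and the nomemoverlap() calls ensure no output byte range intersects an input byte range, so lanes can be loaded and stored out of order. As a rough, self-contained model of that second condition (extents_disjoint is an illustrative helper, not the nomemoverlap() defined earlier in this file, which handles more cases), consider:

    #include <stdint.h>

    /*
     * Hypothetical sketch: a strided operand touches the byte range
     * [base, base + nbytes) when nbytes >= 0, or [base + nbytes, base)
     * when the step is negative. Vectorized processing is only safe when
     * the input and output ranges do not intersect.
     */
    static int
    extents_disjoint(const char *a, long long a_nbytes,
                     const char *b, long long b_nbytes)
    {
        uintptr_t a_lo = (uintptr_t)(a_nbytes >= 0 ? a : a + a_nbytes);
        uintptr_t a_hi = (uintptr_t)(a_nbytes >= 0 ? a + a_nbytes : a);
        uintptr_t b_lo = (uintptr_t)(b_nbytes >= 0 ? b : b + b_nbytes);
        uintptr_t b_hi = (uintptr_t)(b_nbytes >= 0 ? b + b_nbytes : b);
        return a_hi <= b_lo || b_hi <= a_lo;
    }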
@@ -294,6 +301,42 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in


/**end repeat1**/
+
+ #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+ static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+
+ static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+ AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+ #endif
+
+ static NPY_INLINE int
+ run_binary_avx512_skx_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+ {
+ #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+     if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+         AVX512_SKX_ldexp_@TYPE@(args, dimensions, steps);
+         return 1;
+     }
+     else
+         return 0;
+ #endif
+     return 0;
+ }
+
+ static NPY_INLINE int
+ run_unary_two_out_avx512_skx_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+ {
+ #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+     if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+         AVX512_SKX_frexp_@TYPE@(args, dimensions, steps);
+         return 1;
+     }
+     else
+         return 0;
+ #endif
+     return 0;
+ }
/**end repeat**/

/**begin repeat
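
The run_* wrappers above return 1 only when the AVX512_SKX kernel handled the whole loop, and 0 otherwise (preconditions not met, or a build without the required intrinsics), so the caller always keeps a scalar fallback. A minimal, self-contained sketch of that dispatch pattern, using hypothetical names rather than the actual @TYPE@-templated loops:

    #include <math.h>
    #include <stddef.h>

    /* Stand-in for run_binary_avx512_skx_ldexp_@TYPE@(): report 1 only if the
     * SIMD path both applied and processed every element. */
    static int
    try_simd_ldexp(const float *x, const int *e, float *out, size_t n)
    {
        (void)x; (void)e; (void)out; (void)n;
        return 0;   /* pretend the preconditions were not satisfied */
    }

    /* The caller keeps a scalar loop for everything the SIMD path rejects:
     * oversized strides, overlapping operands, unsupported builds. */
    static void
    ldexp_loop(const float *x, const int *e, float *out, size_t n)
    {
        if (try_simd_ldexp(x, e, out, n)) {
            return;
        }
        for (size_t i = 0; i < n; i++) {
            out[i] = ldexpf(x[i], e[i]);
        }
    }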
@@ -2089,13 +2132,167 @@ AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, co
 * #num_lanes = 16, 8#
 * #vsuffix = ps, pd#
 * #mask = __mmask16, __mmask8#
- * #vtype = __m512, __m512d#
+ * #vtype1 = __m512, __m512d#
+ * #vtype2 = __m512i, __m256i#
 * #scale = 4, 8#
 * #vindextype = __m512i, __m256i#
 * #vindexsize = 512, 256#
 * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ * #vtype2_load = _mm512_maskz_loadu_epi32, _mm256_maskz_loadu_epi32#
+ * #vtype2_gather = _mm512_mask_i32gather_epi32, _mm256_mmask_i32gather_epi32#
+ * #vtype2_store = _mm512_mask_storeu_epi32, _mm256_mask_storeu_epi32#
+ * #vtype2_scatter = _mm512_mask_i32scatter_epi32, _mm256_mask_i32scatter_epi32#
+ * #setzero = _mm512_setzero_epi32, _mm256_setzero_si256#
 */

+ #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+ static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+ AVX512_SKX_ldexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+ {
+     const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+     const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
+     const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
+     const npy_intp array_size = dimensions[0];
+     npy_intp num_remaining_elements = array_size;
+     @type@* ip1 = (@type@*) args[0];
+     int* ip2 = (int*) args[1];
+     @type@* op = (@type@*) args[2];
+
+     @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+     /*
+      * Note: while generally indices are npy_intp, we ensure that our maximum index
+      * will fit in an int32 as a precondition for this function via
+      * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+      */
+
+     npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+     for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+         index_ip1[ii] = ii*stride_ip1;
+         index_ip2[ii] = ii*stride_ip2;
+         index_op[ii] = ii*stride_op;
+     }
+     @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+     @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+     @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+     @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+     @vtype2@ zeros = @setzero@();
+
+     while (num_remaining_elements > 0) {
+         if (num_remaining_elements < @num_lanes@) {
+             load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                     num_remaining_elements, @num_lanes@);
+         }
+         @vtype1@ x1;
+         @vtype2@ x2;
+         if (stride_ip1 == 1) {
+             x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+         }
+         else {
+             x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+         }
+         if (stride_ip2 == 1) {
+             x2 = @vtype2_load@(load_mask, ip2);
+         }
+         else {
+             x2 = @vtype2_gather@(zeros, load_mask, vindex_ip2, ip2, 4);
+         }
+
+         @vtype1@ out = _mm512_scalef_@vsuffix@(x1, _mm512_cvtepi32_@vsuffix@(x2));
+
+         if (stride_op == 1) {
+             _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+         }
+         else {
+             /* scatter! */
+             _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+         }
+
+         ip1 += @num_lanes@*stride_ip1;
+         ip2 += @num_lanes@*stride_ip2;
+         op += @num_lanes@*stride_op;
+         num_remaining_elements -= @num_lanes@;
+     }
+ }
+
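
Per lane, the kernel above leans on _mm512_scalef_@vsuffix@, which computes x1 * 2**floor(x2); since x2 is produced by converting int32 exponents, this matches ldexp for the inputs the kernel accepts. A scalar reference for the single-precision expansion, contiguous case only and with hypothetical buffer names:

    #include <math.h>
    #include <stddef.h>

    /* Reference for what the vector loop produces: op[i] = ip1[i] * 2**ip2[i],
     * the same result vscalefps gives once the int32 exponents are converted
     * to float. */
    static void
    ldexp_scalar_reference(const float *ip1, const int *ip2, float *op, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            op[i] = ldexpf(ip1[i], ip2[i]);
        }
    }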
+ static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
+ AVX512_SKX_frexp_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+ {
+     const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+     const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(@type@);
+     const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
+     const npy_intp array_size = dimensions[0];
+     npy_intp num_remaining_elements = array_size;
+     @type@* ip1 = (@type@*) args[0];
+     @type@* op1 = (@type@*) args[1];
+     int* op2 = (int*) args[2];
+
+     @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+     /*
+      * Note: while generally indices are npy_intp, we ensure that our maximum index
+      * will fit in an int32 as a precondition for this function via
+      * IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP
+      */
+
+     npy_int32 index_ip1[@num_lanes@], index_op1[@num_lanes@], index_op2[@num_lanes@];
+     for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+         index_ip1[ii] = ii*stride_ip1;
+         index_op1[ii] = ii*stride_op1;
+         index_op2[ii] = ii*stride_op2;
+     }
+     @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+     @vindextype@ vindex_op1 = @vindexload@((@vindextype@*)&index_op1[0]);
+     @vindextype@ vindex_op2 = @vindexload@((@vindextype@*)&index_op2[0]);
+     @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();
+
+     while (num_remaining_elements > 0) {
+         if (num_remaining_elements < @num_lanes@) {
+             load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                     num_remaining_elements, @num_lanes@);
+         }
+         @vtype1@ x1;
+         if (stride_ip1 == 1) {
+             x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+         }
+         else {
+             x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+         }
+
+         /*
+          * The x86 instructions vgetmant and vgetexp do not conform
+          * with NumPy's output for the special floating point values
+          * NAN, +/-INF and +/-0.0. We mask these values with spmask
+          * to avoid invalid exceptions.
+          */
+         @mask@ spmask = _mm512_knot(_mm512_fpclass_@vsuffix@_mask(
+                 x1, 0b10011111));
+         @vtype1@ out1 = _mm512_maskz_getmant_@vsuffix@(
+                 spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+         out1 = _mm512_mask_mov_@vsuffix@(x1, spmask, out1);
+         @vtype2@ out2 = _mm512_cvt@vsuffix@_epi32(
+                 _mm512_maskz_add_@vsuffix@(spmask, _mm512_set1_@vsuffix@(1.0),
+                     _mm512_maskz_getexp_@vsuffix@(spmask, x1)));
+         if (stride_op1 == 1) {
+             _mm512_mask_storeu_@vsuffix@(op1, load_mask, out1);
+         }
+         else {
+             _mm512_mask_i32scatter_@vsuffix@(op1, load_mask, vindex_op1, out1, @scale@);
+         }
+         if (stride_op2 == 1) {
+             @vtype2_store@(op2, load_mask, out2);
+         }
+         else {
+             @vtype2_scatter@(op2, load_mask, vindex_op2, out2, 4);
+         }
+
+         ip1 += @num_lanes@*stride_ip1;
+         op1 += @num_lanes@*stride_op1;
+         op2 += @num_lanes@*stride_op2;
+         num_remaining_elements -= @num_lanes@;
+     }
+ }
+ #endif
+
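
For reference, frexp() splits x into a mantissa m in [0.5, 1) and an integer exponent e with x == m * 2**e. The kernel gets m from vgetmant with _MM_MANT_NORM_p5_1 and builds e as getexp(x) + 1, since getexp returns floor(log2(|x|)); lanes flagged by the vfpclass mask (NaN, +/-inf, +/-0.0) keep x as the mantissa and, through the zero-masked add, get a zero exponent. A scalar sketch of the same contract for normal single-precision inputs, with hypothetical buffer names:

    #include <math.h>
    #include <stddef.h>

    /* Reference semantics for the frexp kernel on normal inputs:
     * op1[i] is the mantissa in [0.5, 1) and op2[i] the exponent with
     * ip1[i] == op1[i] * 2**op2[i]; frexpf() reports exactly this pair. */
    static void
    frexp_scalar_reference(const float *ip1, float *op1, int *op2, size_t n)
    {
        for (size_t i = 0; i < n; i++) {
            op1[i] = frexpf(ip1[i], &op2[i]);
        }
    }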
/**begin repeat1
 * #func = maximum, minimum#
 * #vectorf = max, min#
@@ -2131,14 +2328,14 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
     @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
     @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
     @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
-     @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+     @vtype1@ zeros_f = _mm512_setzero_@vsuffix@();

     while (num_remaining_elements > 0) {
         if (num_remaining_elements < @num_lanes@) {
             load_mask = avx512_get_partial_load_mask_@vsuffix@(
                     num_remaining_elements, @num_lanes@);
         }
-         @vtype@ x1, x2;
+         @vtype1@ x1, x2;
         if (stride_ip1 == 1) {
             x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
         }
@@ -2158,7 +2355,7 @@ AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *s
          * this issue to conform with NumPy behaviour.
          */
         @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
-         @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+         @vtype1@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
         out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);

         if (stride_op == 1) {
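
The nan_mask/blend pair in this hunk encodes NumPy's NaN propagation for maximum and minimum: the hardware max/min instructions can return the second operand when the first is NaN, so the kernel re-inserts x1 wherever x1 != x1. A scalar model of that rule (illustrative only, not a loop from this file):

    #include <math.h>

    /* np.maximum-style semantics: the result is NaN whenever either input
     * is NaN. The comparison below mirrors the vector max (a NaN in x2
     * already wins), and the explicit isnan(x1) check mirrors the
     * nan_mask blend. */
    static float
    maximum_with_nan_propagation(float x1, float x2)
    {
        float out = (x1 > x2) ? x1 : x2;   /* stand-in for the vector max */
        if (isnan(x1)) {
            out = x1;
        }
        return out;
    }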