@@ -2206,9 +2206,6 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
2206
2206
return unrollx - zero_count ;
2207
2207
}
2208
2208
2209
- #define MAX (x , y ) (((x) > (y)) ? (x) : (y))
2210
- #define MIN (x , y ) (((x) < (y)) ? (x) : (y))
2211
-
2212
2209
2213
2210
static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
2214
2211
count_nonzero_int16_simd (npy_int16 * d , npy_uintp unrollx )
@@ -2225,7 +2222,7 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
2225
2222
2226
2223
while (d < end ) {
2227
2224
npyv_u16 vsum16 = npyv_zero_u16 ();
2228
- target = MIN (target + innerloop_jump , end );
2225
+ target = PyArray_MIN (target + innerloop_jump , end );
2229
2226
for (; d < target ; d += npyv_nlanes_u16 ) {
2230
2227
npyv_u16 vt = npyv_cvt_u16_b16 (npyv_cmpeq_u16 (npyv_load_u16 (d ), vzero ));
2231
2228
vt = npyv_and_u16 (vt , vone );
@@ -2255,7 +2252,7 @@ count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
2255
2252
npy_int32 * target = d ;
2256
2253
while (d < end ) {
2257
2254
npyv_u32 vsum32 = npyv_zero_u32 ();
2258
- target = MIN (target + innerloop_jump , end );
2255
+ target = PyArray_MIN (target + innerloop_jump , end );
2259
2256
for (; d < target ; d += npyv_nlanes_u32 ) {
2260
2257
npyv_u32 vt = npyv_cvt_u32_b32 (npyv_cmpeq_u32 (npyv_load_u32 (d ), vzero ));
2261
2258
vt = npyv_and_u32 (vt , vone );
@@ -2294,60 +2291,9 @@ count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
2294
2291
2295
2292
#endif
2296
2293
2297
- static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2298
- count_nonzero_int16 (int ndim , const npy_int16 * data , const npy_intp * ashape , const npy_intp * astrides )
2299
- {
2300
- int idim ;
2301
- npy_intp shape [NPY_MAXDIMS ], strides [NPY_MAXDIMS ];
2302
- npy_intp coord [NPY_MAXDIMS ];
2303
- npy_intp count = 0 ;
2304
- NPY_BEGIN_THREADS_DEF ;
2305
-
2306
- /* Use raw iteration with no heap memory allocation */
2307
- if (PyArray_PrepareOneRawArrayIter (
2308
- ndim , ashape ,
2309
- data , astrides ,
2310
- & ndim , shape ,
2311
- & data , strides ) < 0 ) {
2312
- return -1 ;
2313
- }
2314
-
2315
- /* Handle zero-sized array */
2316
- if (shape [0 ] == 0 ) {
2317
- return 0 ;
2318
- }
2319
-
2320
- NPY_BEGIN_THREADS_THRESHOLDED (shape [0 ]);
2321
- if (strides [0 ] == 2 ) {
2322
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2323
- /* Process the innermost dimension */
2324
- const npy_int16 * d = data ;
2325
- const npy_int16 * e = data + shape [0 ];
2326
- npy_uintp stride = shape [0 ] & - npyv_nlanes_u16 ;
2327
- count += count_nonzero_int16_simd (d , stride );
2328
- d += stride ;
2329
- for (; d < e ; ++ d ) {
2330
- count += (* d != 0 );
2331
- }
2332
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2333
- } else {
2334
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2335
- npy_int16 * d = data ;
2336
- /* Process the innermost dimension */
2337
- for (npy_intp i = 0 ; i < shape [0 ]; ++ i , d = ((npy_int8 * ) d ) + strides [0 ]) {
2338
- count += (* d != 0 );
2339
- }
2340
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2341
- }
2342
-
2343
- NPY_END_THREADS ;
2344
-
2345
- return count ;
2346
- }
2347
-
2348
2294
2349
2295
static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2350
- count_nonzero_int32 (int ndim , const npy_int32 * data , const npy_intp * ashape , const npy_intp * astrides )
2296
+ count_nonzero_int (int ndim , void * data , const npy_intp * ashape , const npy_intp * astrides , int type_num )
2351
2297
{
2352
2298
int idim ;
2353
2299
npy_intp shape [NPY_MAXDIMS ], strides [NPY_MAXDIMS ];
@@ -2369,83 +2315,58 @@ count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, con
2369
2315
return 0 ;
2370
2316
}
2371
2317
2372
- NPY_BEGIN_THREADS_THRESHOLDED (shape [0 ]);
2373
- if (strides [0 ] == 4 ) {
2374
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2375
- /* Process the innermost dimension */
2376
- const npy_int32 * d = data ;
2377
- const npy_int32 * e = data + shape [0 ];
2378
- npy_uintp stride = shape [0 ] & - npyv_nlanes_u32 ;
2379
- count += count_nonzero_int32_simd (d , stride );
2380
- d += stride ;
2381
- for (; d < e ; ++ d ) {
2382
- count += (* d != 0 );
2383
- }
2384
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2385
- } else {
2386
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2387
- npy_int32 * d = data ;
2388
- /* Process the innermost dimension */
2389
- for (npy_intp i = 0 ; i < shape [0 ]; ++ i , d = ((npy_int8 * ) d ) + strides [0 ]) {
2390
- count += (* d != 0 );
2391
- }
2392
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2393
- }
2394
-
2395
- NPY_END_THREADS ;
2396
2318
2397
- return count ;
2398
- }
2319
+ #define _ITERATE_INT_SIMPLE (bits ) \
2320
+ npy_int##bits *d = (npy_int##bits *) data; \
2321
+ NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
2322
+ /* Process the innermost dimension */ \
2323
+ for (npy_intp i = 0 ; i < shape [0 ]; ++ i , d = ((npy_int8 * ) d ) + strides [0 ]) { \
2324
+ count += (* d != 0 ); \
2325
+ } \
2326
+ d = (npy_int ##bits *) data; \
2327
+ } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d, strides);
2328
+
2329
+ #define _ITERATE_INT (bits , bytes ) \
2330
+ if (strides[0] == bytes) { \
2331
+ npy_int##bits *d2 = (npy_int##bits *) data; \
2332
+ NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
2333
+ /* Process the innermost dimension */ \
2334
+ const npy_int ##bits *d = (npy_int##bits *) data; \
2335
+ const npy_int##bits *e = ((npy_int##bits *) data) + shape[0]; \
2336
+ npy_uintp stride = shape[0] & -npyv_nlanes_u##bits; \
2337
+ count += count_nonzero_int##bits##_simd(d, stride); \
2338
+ d += stride; \
2339
+ for (; d < e; ++d) { \
2340
+ count += (*d != 0); \
2341
+ } \
2342
+ } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, d2, strides); \
2343
+ } else { \
2344
+ _ITERATE_INT_SIMPLE(bits) \
2345
+ }
2399
2346
2347
+ #if NPY_SIMD
2348
+ #define _ITERATE_I16 _ITERATE_INT(16, 2)
2349
+ #define _ITERATE_I32 _ITERATE_INT(32, 4)
2350
+ #define _ITERATE_I64 _ITERATE_INT(64, 8)
2351
+ #else
2352
+ #define _ITERATE_I16 _ITERATE_INT_SIMPLE(16)
2353
+ #define _ITERATE_I32 _ITERATE_INT_SIMPLE(32)
2354
+ #define _ITERATE_I64 _ITERATE_INT_SIMPLE(64)
2355
+ #endif
2400
2356
2401
- static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2402
- count_nonzero_int64 (int ndim , const npy_int64 * data , const npy_intp * ashape , const npy_intp * astrides )
2403
- {
2404
- int idim ;
2405
- npy_intp shape [NPY_MAXDIMS ], strides [NPY_MAXDIMS ];
2406
- npy_intp coord [NPY_MAXDIMS ];
2407
- npy_intp count = 0 ;
2408
- NPY_BEGIN_THREADS_DEF ;
2357
+ NPY_BEGIN_THREADS_THRESHOLDED (shape [0 ]);
2409
2358
2410
- /* Use raw iteration with no heap memory allocation */
2411
- if (PyArray_PrepareOneRawArrayIter (
2412
- ndim , ashape ,
2413
- data , astrides ,
2414
- & ndim , shape ,
2415
- & data , strides ) < 0 ) {
2416
- return -1 ;
2359
+ if (type_num == NPY_INT16 || type_num == NPY_UINT16 ) {
2360
+ _ITERATE_I16 ;
2417
2361
}
2418
-
2419
- /* Handle zero-sized array */
2420
- if (shape [0 ] == 0 ) {
2421
- return 0 ;
2362
+ else if (type_num == NPY_INT32 || type_num == NPY_UINT32 ) {
2363
+ _ITERATE_I32 ;
2422
2364
}
2423
-
2424
- NPY_BEGIN_THREADS_THRESHOLDED (shape [0 ]);
2425
-
2426
- if (strides [0 ] == 8 ) {
2427
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2428
- /* Process the innermost dimension */
2429
- const npy_int64 * d = data ;
2430
- const npy_int64 * e = data + shape [0 ];
2431
- npy_uintp stride = shape [0 ] & - npyv_nlanes_u64 ;
2432
- count += count_nonzero_int64_simd (d , stride );
2433
- d += stride ;
2434
- for (; d < e ; ++ d ) {
2435
- count += (* d != 0 );
2436
- }
2437
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2438
- } else {
2439
- NPY_RAW_ITER_START (idim , ndim , coord , shape ) {
2440
- npy_int64 * d = data ;
2441
- /* Process the innermost dimension */
2442
- for (npy_intp i = 0 ; i < shape [0 ]; ++ i , d = ((npy_int8 * ) d ) + strides [0 ]) {
2443
- count += (* d != 0 );
2444
- }
2445
- } NPY_RAW_ITER_ONE_NEXT (idim , ndim , coord , shape , data , strides );
2365
+ else if (type_num == NPY_INT64 || type_num == NPY_UINT64 ) {
2366
+ _ITERATE_I64 ;
2446
2367
}
2447
2368
2448
- NPY_END_THREADS ;
2369
+ NPY_END_THREADS ;
2449
2370
2450
2371
return count ;
2451
2372
}
@@ -2547,23 +2468,28 @@ PyArray_CountNonzero(PyArrayObject *self)
2547
2468
dtype = PyArray_DESCR (self );
2548
2469
2549
2470
2550
- #if NPY_SIMD
2551
- if (dtype -> type_num == NPY_INT16 || dtype -> type_num == NPY_UINT16 ) {
2552
- return count_nonzero_int16 (PyArray_NDIM (self ), (npy_int16 * ) PyArray_DATA (self ),
2553
- PyArray_DIMS (self ), PyArray_STRIDES (self ));
2554
- }
2471
+ // #if NPY_SIMD
2472
+ // if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
2473
+ // return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
2474
+ // PyArray_DIMS(self), PyArray_STRIDES(self));
2475
+ // }
2555
2476
2556
- if (dtype -> type_num == NPY_INT32 || dtype -> type_num == NPY_UINT32 ) {
2557
- return count_nonzero_int32 (PyArray_NDIM (self ), (npy_int32 * ) PyArray_DATA (self ),
2558
- PyArray_DIMS (self ), PyArray_STRIDES (self ));
2559
- }
2477
+ // if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
2478
+ // return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
2479
+ // PyArray_DIMS(self), PyArray_STRIDES(self));
2480
+ // }
2560
2481
2561
- if (dtype -> type_num == NPY_INT64 || dtype -> type_num == NPY_UINT64 ) {
2562
- return count_nonzero_int64 (PyArray_NDIM (self ), (npy_int64 * ) PyArray_DATA (self ),
2563
- PyArray_DIMS (self ), PyArray_STRIDES (self ));
2564
- }
2482
+ // if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
2483
+ // return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
2484
+ // PyArray_DIMS(self), PyArray_STRIDES(self));
2485
+ // }
2565
2486
2566
- #endif
2487
+ // #endif
2488
+
2489
+ if (dtype -> type_num >= NPY_INT16 && dtype -> type_num <= NPY_UINT64 ) {
2490
+ return count_nonzero_int (PyArray_NDIM (self ), (void * ) PyArray_DATA (self ),
2491
+ PyArray_DIMS (self ), PyArray_STRIDES (self ), dtype -> type_num );
2492
+ }
2567
2493
2568
2494
if (dtype -> type_num == NPY_BOOL || dtype -> type_num == NPY_INT8 || dtype -> type_num == NPY_UINT8 ) {
2569
2495
return count_boolean_trues (PyArray_NDIM (self ), PyArray_DATA (self ),
0 commit comments