Skip to content

Commit 15cf37d

Browse files
committed
Merged count_nonzero_int16/int32/int64 into count_nonzero_int and added benchmarks
1 parent c716a12 commit 15cf37d

File tree

2 files changed

+67
-141
lines changed

2 files changed

+67
-141
lines changed

benchmarks/benchmarks/bench_core.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ class CountNonzero(Benchmark):
136136
params = [
137137
[1, 2, 3],
138138
[100, 10000, 1000000],
139-
[bool, int, str, object]
139+
[bool, np.int8, np.int16, np.int32, np.int64, str, object]
140140
]
141141

142142
def setup(self, numaxes, size, dtype):

numpy/core/src/multiarray/item_selection.c

+66-140
Original file line numberDiff line numberDiff line change
@@ -2206,9 +2206,6 @@ count_nonzero_bytes(const npy_uint8 *d, npy_uintp unrollx)
22062206
return unrollx - zero_count;
22072207
}
22082208

2209-
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
2210-
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
2211-
22122209

22132210
static NPY_INLINE NPY_GCC_OPT_3 npy_uintp
22142211
count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
@@ -2225,7 +2222,7 @@ count_nonzero_int16_simd(npy_int16 *d, npy_uintp unrollx)
22252222

22262223
while (d<end) {
22272224
npyv_u16 vsum16 = npyv_zero_u16();
2228-
target = MIN(target+innerloop_jump, end);
2225+
target = PyArray_MIN(target+innerloop_jump, end);
22292226
for (; d<target; d+=npyv_nlanes_u16) {
22302227
npyv_u16 vt = npyv_cvt_u16_b16(npyv_cmpeq_u16(npyv_load_u16(d), vzero));
22312228
vt = npyv_and_u16(vt, vone);
@@ -2255,7 +2252,7 @@ count_nonzero_int32_simd(npy_int32 *d, npy_uintp unrollx)
22552252
npy_int32 *target = d;
22562253
while (d<end) {
22572254
npyv_u32 vsum32 = npyv_zero_u32();
2258-
target = MIN(target+innerloop_jump, end);
2255+
target = PyArray_MIN(target+innerloop_jump, end);
22592256
for (; d<target; d+=npyv_nlanes_u32) {
22602257
npyv_u32 vt = npyv_cvt_u32_b32(npyv_cmpeq_u32(npyv_load_u32(d), vzero));
22612258
vt = npyv_and_u32(vt, vone);
@@ -2294,60 +2291,9 @@ count_nonzero_int64_simd(npy_int64 *d, npy_uintp unrollx)
22942291

22952292
#endif
22962293

2297-
static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2298-
count_nonzero_int16(int ndim, const npy_int16 *data, const npy_intp *ashape, const npy_intp *astrides)
2299-
{
2300-
int idim;
2301-
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
2302-
npy_intp coord[NPY_MAXDIMS];
2303-
npy_intp count = 0;
2304-
NPY_BEGIN_THREADS_DEF;
2305-
2306-
/* Use raw iteration with no heap memory allocation */
2307-
if (PyArray_PrepareOneRawArrayIter(
2308-
ndim, ashape,
2309-
data, astrides,
2310-
&ndim, shape,
2311-
&data, strides) < 0) {
2312-
return -1;
2313-
}
2314-
2315-
/* Handle zero-sized array */
2316-
if (shape[0] == 0) {
2317-
return 0;
2318-
}
2319-
2320-
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
2321-
if (strides[0] == 2) {
2322-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2323-
/* Process the innermost dimension */
2324-
const npy_int16 *d = data;
2325-
const npy_int16 *e = data + shape[0];
2326-
npy_uintp stride = shape[0] & -npyv_nlanes_u16;
2327-
count += count_nonzero_int16_simd(d, stride);
2328-
d += stride;
2329-
for (; d < e; ++d) {
2330-
count += (*d != 0);
2331-
}
2332-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2333-
} else {
2334-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2335-
npy_int16 *d = data;
2336-
/* Process the innermost dimension */
2337-
for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
2338-
count += (*d != 0);
2339-
}
2340-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2341-
}
2342-
2343-
NPY_END_THREADS;
2344-
2345-
return count;
2346-
}
2347-
23482294

23492295
static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2350-
count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, const npy_intp *astrides)
2296+
count_nonzero_int(int ndim, void *data, const npy_intp *ashape, const npy_intp *astrides, int type_num)
23512297
{
23522298
int idim;
23532299
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
@@ -2369,83 +2315,58 @@ count_nonzero_int32(int ndim, const npy_int32 *data, const npy_intp *ashape, con
23692315
return 0;
23702316
}
23712317

2372-
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
2373-
if (strides[0] == 4) {
2374-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2375-
/* Process the innermost dimension */
2376-
const npy_int32 *d = data;
2377-
const npy_int32 *e = data + shape[0];
2378-
npy_uintp stride = shape[0] & -npyv_nlanes_u32;
2379-
count += count_nonzero_int32_simd(d, stride);
2380-
d += stride;
2381-
for (; d < e; ++d) {
2382-
count += (*d != 0);
2383-
}
2384-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2385-
} else {
2386-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2387-
npy_int32 *d = data;
2388-
/* Process the innermost dimension */
2389-
for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
2390-
count += (*d != 0);
2391-
}
2392-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2393-
}
2394-
2395-
NPY_END_THREADS;
23962318

2397-
return count;
2398-
}
2319+
/*
 * Helper macros for count_nonzero_int.
 *
 * Both macros expand inside the function body and use its locals by name:
 * `data` (start of the array), `shape`/`strides` (strides[0] is treated as a
 * byte count), `idim`, `ndim`, `coord` (raw-iterator state) and `count`
 * (running total that the macros add to).
 *
 * The outer iteration keeps its position in a `char *` cursor. The cursor is
 * what gets handed to NPY_RAW_ITER_ONE_NEXT, so any pointer arithmetic that
 * macro performs with `strides` is done in bytes rather than in elements
 * (NOTE(review): confirm against the macro's definition), and each pass of
 * the loop body starts from the cursor, i.e. from the current row, instead
 * of from `data`.
 *
 * Each macro is a single statement; write `_ITERATE_INT(16, 2);`.
 */

/*
 * Count the non-zero npy_int<bits> elements one at a time, walking the
 * innermost dimension in steps of strides[0] bytes. Works for any stride.
 */
#define _ITERATE_INT_SIMPLE(bits) \
    do { \
        char *row = (char *) data; \
        NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
            /* Process the innermost dimension */ \
            const char *p = row; \
            for (npy_intp i = 0; i < shape[0]; ++i, p += strides[0]) { \
                count += (*(const npy_int##bits *) p != 0); \
            } \
        } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, row, strides); \
    } while (0)

/*
 * Count the non-zero npy_int<bits> elements. When the innermost dimension is
 * contiguous (strides[0] == bytes) the largest multiple of
 * npyv_nlanes_u<bits> elements of each row is passed to
 * count_nonzero_int<bits>_simd and the remaining tail is counted with a
 * scalar loop; otherwise this falls back to _ITERATE_INT_SIMPLE.
 */
#define _ITERATE_INT(bits, bytes) \
    do { \
        if (strides[0] == (bytes)) { \
            char *row = (char *) data; \
            NPY_RAW_ITER_START(idim, ndim, coord, shape) { \
                /* Process the innermost dimension */ \
                npy_int##bits *d = (npy_int##bits *) row; \
                const npy_int##bits *e = d + shape[0]; \
                /* number of leading elements that fill whole vectors */ \
                npy_uintp simd_len = shape[0] & -npyv_nlanes_u##bits; \
                count += count_nonzero_int##bits##_simd(d, simd_len); \
                d += simd_len; \
                for (; d < e; ++d) { \
                    count += (*d != 0); \
                } \
            } NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, row, strides); \
        } \
        else { \
            _ITERATE_INT_SIMPLE(bits); \
        } \
    } while (0)
23992346

2347+
#if NPY_SIMD
2348+
#define _ITERATE_I16 _ITERATE_INT(16, 2)
2349+
#define _ITERATE_I32 _ITERATE_INT(32, 4)
2350+
#define _ITERATE_I64 _ITERATE_INT(64, 8)
2351+
#else
2352+
#define _ITERATE_I16 _ITERATE_INT_SIMPLE(16)
2353+
#define _ITERATE_I32 _ITERATE_INT_SIMPLE(32)
2354+
#define _ITERATE_I64 _ITERATE_INT_SIMPLE(64)
2355+
#endif
24002356

2401-
static NPY_INLINE NPY_GCC_OPT_3 npy_intp
2402-
count_nonzero_int64(int ndim, const npy_int64 *data, const npy_intp *ashape, const npy_intp *astrides)
2403-
{
2404-
int idim;
2405-
npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
2406-
npy_intp coord[NPY_MAXDIMS];
2407-
npy_intp count = 0;
2408-
NPY_BEGIN_THREADS_DEF;
2357+
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
24092358

2410-
/* Use raw iteration with no heap memory allocation */
2411-
if (PyArray_PrepareOneRawArrayIter(
2412-
ndim, ashape,
2413-
data, astrides,
2414-
&ndim, shape,
2415-
&data, strides) < 0) {
2416-
return -1;
2359+
if (type_num == NPY_INT16 || type_num == NPY_UINT16) {
2360+
_ITERATE_I16;
24172361
}
2418-
2419-
/* Handle zero-sized array */
2420-
if (shape[0] == 0) {
2421-
return 0;
2362+
else if (type_num == NPY_INT32 || type_num == NPY_UINT32) {
2363+
_ITERATE_I32;
24222364
}
2423-
2424-
NPY_BEGIN_THREADS_THRESHOLDED(shape[0]);
2425-
2426-
if (strides[0] == 8) {
2427-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2428-
/* Process the innermost dimension */
2429-
const npy_int64 *d = data;
2430-
const npy_int64 *e = data + shape[0];
2431-
npy_uintp stride = shape[0] & -npyv_nlanes_u64;
2432-
count += count_nonzero_int64_simd(d, stride);
2433-
d += stride;
2434-
for (; d < e; ++d) {
2435-
count += (*d != 0);
2436-
}
2437-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2438-
} else {
2439-
NPY_RAW_ITER_START(idim, ndim, coord, shape) {
2440-
npy_int64 *d = data;
2441-
/* Process the innermost dimension */
2442-
for (npy_intp i = 0; i < shape[0]; ++i, d = ((npy_int8*) d) + strides[0]) {
2443-
count += (*d != 0);
2444-
}
2445-
} NPY_RAW_ITER_ONE_NEXT(idim, ndim, coord, shape, data, strides);
2365+
else if (type_num == NPY_INT64 || type_num == NPY_UINT64) {
2366+
_ITERATE_I64;
24462367
}
24472368

2448-
NPY_END_THREADS;
2369+
NPY_END_THREADS;
24492370

24502371
return count;
24512372
}
@@ -2547,23 +2468,28 @@ PyArray_CountNonzero(PyArrayObject *self)
25472468
dtype = PyArray_DESCR(self);
25482469

25492470

2550-
#if NPY_SIMD
2551-
if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
2552-
return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
2553-
PyArray_DIMS(self), PyArray_STRIDES(self));
2554-
}
2471+
// #if NPY_SIMD
2472+
// if (dtype->type_num == NPY_INT16 || dtype->type_num == NPY_UINT16) {
2473+
// return count_nonzero_int16(PyArray_NDIM(self), (npy_int16 *) PyArray_DATA(self),
2474+
// PyArray_DIMS(self), PyArray_STRIDES(self));
2475+
// }
25552476

2556-
if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
2557-
return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
2558-
PyArray_DIMS(self), PyArray_STRIDES(self));
2559-
}
2477+
// if (dtype->type_num == NPY_INT32 || dtype->type_num == NPY_UINT32) {
2478+
// return count_nonzero_int32(PyArray_NDIM(self), (npy_int32 *) PyArray_DATA(self),
2479+
// PyArray_DIMS(self), PyArray_STRIDES(self));
2480+
// }
25602481

2561-
if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
2562-
return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
2563-
PyArray_DIMS(self), PyArray_STRIDES(self));
2564-
}
2482+
// if (dtype->type_num == NPY_INT64 || dtype->type_num == NPY_UINT64) {
2483+
// return count_nonzero_int64(PyArray_NDIM(self), (npy_int64 *) PyArray_DATA(self),
2484+
// PyArray_DIMS(self), PyArray_STRIDES(self));
2485+
// }
25652486

2566-
#endif
2487+
// #endif
2488+
2489+
if (dtype->type_num >= NPY_INT16 && dtype->type_num <= NPY_UINT64) {
2490+
return count_nonzero_int(PyArray_NDIM(self), (void *) PyArray_DATA(self),
2491+
PyArray_DIMS(self), PyArray_STRIDES(self), dtype->type_num);
2492+
}
25672493

25682494
if (dtype->type_num == NPY_BOOL || dtype->type_num == NPY_INT8 || dtype->type_num == NPY_UINT8) {
25692495
return count_boolean_trues(PyArray_NDIM(self), PyArray_DATA(self),

0 commit comments

Comments
 (0)