27
27
#include "arrayobject.h"
28
28
#include "alloc.h"
29
29
#include "typeinfo.h"
30
+ #if defined(__ARM_NEON__ ) || defined (__ARM_NEON )
31
+ #include <arm_neon.h>
32
+ #endif
30
33
#ifdef NPY_HAVE_SSE2_INTRINSICS
31
34
#include <emmintrin.h>
32
35
#endif
@@ -3070,7 +3073,15 @@ finish:
3070
3073
** ARGFUNC **
3071
3074
*****************************************************************************
3072
3075
*/
3073
-
3076
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
/*
 * NEON counterpart of SSE2's _mm_movemask_epi8.
 *
 * Collects the most significant bit of each of the 16 bytes in `input`
 * into the low 16 bits of the return value: byte lane k contributes
 * bit k. All higher bits of the result are zero.
 *
 * NOTE(review): this helper has external linkage; if no other translation
 * unit references it, consider making it `static`.
 */
int32_t _mm_movemask_epi8_neon(uint8x16_t input)
{
    /* Per-lane left-shift counts 0..7, repeated for both 8-byte halves. */
    const int8x8_t half_shifts = vcreate_s8(0x0706050403020100ULL);
    const int8x16_t lane_shifts = vcombine_s8(half_shifts, half_shifts);

    /* Reduce each byte to its top bit (0 or 1), then move it to bit (lane % 8). */
    const uint8x16_t top_bits = vshrq_n_u8(input, 7);
    const uint8x16_t weighted = vshlq_u8(top_bits, lane_shifts);

    /*
     * Widening pairwise adds fold each 8-byte half into one 64-bit lane.
     * Every byte in a half holds a distinct bit, so the sum equals the OR.
     */
    const uint16x8_t sums16 = vpaddlq_u8(weighted);
    const uint32x4_t sums32 = vpaddlq_u16(sums16);
    const uint64x2_t sums64 = vpaddlq_u32(sums32);

    /* Lane 0 carries mask bits 0..7, lane 1 carries mask bits 8..15. */
    const int low_mask = (int)vgetq_lane_u64(sums64, 0);
    const int high_mask = (int)vgetq_lane_u64(sums64, 1);
    return low_mask + (high_mask << 8);
}
#endif
3074
3085
#define _LESS_THAN_OR_EQUAL (a ,b ) ((a) <= (b))
3075
3086
3076
3087
static int
@@ -3091,6 +3102,19 @@ BOOL_argmax(npy_bool *ip, npy_intp n, npy_intp *max_ind,
3091
3102
break ;
3092
3103
}
3093
3104
}
3105
+ #else
3106
+ #if defined(__ARM_NEON__ ) || defined (__ARM_NEON )
3107
+ uint8x16_t zero = vdupq_n_u8 (0 );
3108
+ for (; i < n - (n % 32 ); i += 32 ) {
3109
+ uint8x16_t d1 = vld1q_u8 ((char * )& ip [i ]);
3110
+ uint8x16_t d2 = vld1q_u8 ((char * )& ip [i + 16 ]);
3111
+ d1 = vceqq_u8 (d1 , zero );
3112
+ d2 = vceqq_u8 (d2 , zero );
3113
+ if (_mm_movemask_epi8_neon (vminq_u8 (d1 , d2 )) != 0xFFFF ) {
3114
+ break ;
3115
+ }
3116
+ }
3117
+ #endif
3094
3118
#endif
3095
3119
for (; i < n ; i ++ ) {
3096
3120
if (ip [i ]) {
0 commit comments