static inline bool vector8_is_highbit_set(const Vector8 v);
#ifndef USE_NO_SIMD
static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
#endif
/* arithmetic operations */

/*
 * comparisons between vectors
 *
 * Note: These return a vector rather than boolean, which is why we don't
 * have non-SIMD implementations.
 */
#ifndef USE_NO_SIMD
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
#endif
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+ return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+ /*
+ * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
+ * returns a uint64, making it inconvenient to combine mask values from
+ * multiple vectors.
+ */
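+	/*
+	 * For reference, that approach takes a vector whose elements are 0xFF
+	 * or 0 (like the shift result below) and narrows each element to one
+	 * nibble of a uint64, roughly:
+	 *
+	 *	vget_lane_u64(vreinterpret_u64_u8(
+	 *		vshrn_n_u16(vreinterpretq_u16_u8(msb), 4)), 0)
+	 *
+	 * with "msb" standing in for that hypothetical 0xFF/0 vector.
+	 */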
+ static const uint8 mask[16] = {
+ 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+ 1 << 4, 1 << 5, 1 << 6, 1 << 7,
+ 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+ 1 << 4, 1 << 5, 1 << 6, 1 << 7,
+ };
+
+	/*
+	 * The signed shift turns each element into 0xFF or 0 according to its
+	 * high bit; masking that leaves bit (i % 8) set in element i.
+	 */
+	uint8x16_t	masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
+	uint8x16_t	maskedhi = vextq_u8(masked, masked, 8);
+
+	/* Interleave the halves and sum; the lanes hold disjoint bits, so adding acts as OR. */
+	return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
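+/*
+ * Illustrative sketch, not part of the patch proper: returning a uint32
+ * rather than a uint64 lets callers combine the masks from two vectors
+ * into a single word.  The helper below is hypothetical and assumes
+ * pg_rightmost_one_pos32() from port/pg_bitutils.h; it returns the
+ * position of the first element with its high bit set across a 32-byte
+ * block, or -1 if there is none.
+ */
+#if 0
+static inline int
+vector8_first_highbit_pos(const Vector8 v1, const Vector8 v2)
+{
+	uint32		mask;
+
+	mask = vector8_highbit_mask(v1);
+	mask |= vector8_highbit_mask(v2) << sizeof(Vector8);
+
+	return (mask != 0) ? pg_rightmost_one_pos32(mask) : -1;
+}
+#endif
+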
/*
* Return the bitwise OR of the inputs
*/
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+ return vminq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
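+/*
+ * Illustrative sketch: since the byte intrinsics only provide equality,
+ * vector8_min() can emulate an unsigned "<=" test, an alternative to the
+ * saturating subtraction used by vector8_has_le().  The helper name below
+ * is hypothetical.
+ */
+#if 0
+static inline bool
+vector8_any_le(const Vector8 v, const uint8 c)
+{
+	Vector8		cmp = vector8_eq(v, vector8_min(v, vector8_broadcast(c)));
+
+	return vector8_is_highbit_set(cmp);
+}
+#endif
+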
#endif /* SIMD_H */