static inline bool vector8_is_highbit_set(const Vector8 v);
#ifndef USE_NO_SIMD
static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
#endif
/* arithmetic operations */

/*
 * comparisons between vectors
 *
 * Note: These return a vector rather than boolean, which is why we don't
 * have non-SIMD implementations.
 */
#ifndef USE_NO_SIMD
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
#endif
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+ return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+ /*
+ * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
+ * returns a uint64, making it inconvenient to combine mask values from
+ * multiple vectors.
+ */
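+	/*
+	 * For reference, that approach takes a vector whose elements are 0xFF
+	 * or 0 (like the shift result below) and narrows each element to one
+	 * nibble of a uint64, roughly:
+	 *
+	 *	vget_lane_u64(vreinterpret_u64_u8(
+	 *		vshrn_n_u16(vreinterpretq_u16_u8(msb), 4)), 0)
+	 *
+	 * with "msb" standing in for that hypothetical 0xFF/0 vector.
+	 */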
+ static const uint8 mask[16] = {
+ 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+ 1 << 4, 1 << 5, 1 << 6, 1 << 7,
+ 1 << 0, 1 << 1, 1 << 2, 1 << 3,
+ 1 << 4, 1 << 5, 1 << 6, 1 << 7,
+ };
+
+	/*
+	 * The signed shift turns each element into 0xFF or 0 according to its
+	 * high bit; masking that leaves bit (i % 8) set in element i.
+	 */
+	uint8x16_t	masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
+	uint8x16_t	maskedhi = vextq_u8(masked, masked, 8);
+
+	/* Interleave the halves and sum; the lanes hold disjoint bits, so adding acts as OR. */
+	return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
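+/*
+ * Illustrative sketch, not part of the patch proper: returning a uint32
+ * rather than a uint64 lets callers combine the masks from two vectors
+ * into a single word.  The helper below is hypothetical and assumes
+ * pg_rightmost_one_pos32() from port/pg_bitutils.h; it returns the
+ * position of the first element with its high bit set across a 32-byte
+ * block, or -1 if there is none.
+ */
+#if 0
+static inline int
+vector8_first_highbit_pos(const Vector8 v1, const Vector8 v2)
+{
+	uint32		mask;
+
+	mask = vector8_highbit_mask(v1);
+	mask |= vector8_highbit_mask(v2) << sizeof(Vector8);
+
+	return (mask != 0) ? pg_rightmost_one_pos32(mask) : -1;
+}
+#endif
+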
/*
* Return the bitwise OR of the inputs
*/
}
#endif /* ! USE_NO_SIMD */
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+ return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+ return vminq_u8(v1, v2);
+#endif
+}
+#endif /* ! USE_NO_SIMD */
+
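+/*
+ * Illustrative sketch: since the byte intrinsics only provide equality,
+ * vector8_min() can emulate an unsigned "<=" test, an alternative to the
+ * saturating subtraction used by vector8_has_le().  The helper name below
+ * is hypothetical.
+ */
+#if 0
+static inline bool
+vector8_any_le(const Vector8 v, const uint8 c)
+{
+	Vector8		cmp = vector8_eq(v, vector8_min(v, vector8_broadcast(c)));
+
+	return vector8_is_highbit_set(cmp);
+}
+#endif
+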
#endif /* SIMD_H */