Fix ARM NEON detection on MSVC arm64

MSVC doesn't define __ARM_NEON, but neon is a necessary part of arm64, so will always be there. At the same time fix the NEON code for MSVC which has a different idea of what the intrinsics types are, even if all the intrinsic functions are the same. This has two consequences: 1. Since NEON lacks construction intrinsics except duplication, NEON mask constants needs to be built differently. 2. Since MSVC has all the NEON types aliases of the same underlying type, QSimdNeon cant do the same type based dispatch as before. Fixes: QTBUG-127646 Pick-to: 6.8 Change-Id: I8038bb6bb4557e8ce29e3844f2742a97b4489818 Reviewed-by: Oliver Wolff <[email protected]>
author: Allan Sandfeld Jensen <[email protected]> 2024-08-07 14:33:51 +0200
committer: Allan Sandfeld Jensen <[email protected]> 2024-08-12 12:10:08 +0200
commit: 572aa7caa04e85cbc07b18e1d0c720038facbf83 (patch)
tree: da05d6586f086f0e30b763b75b24acf8b3fb0d78
parent: e8b1566f0e919d87e58e0fe65607e2d498fb379c (diff)
11 files changed, 86 insertions, 27 deletions
diff --git a/config.tests/arch/arch.cpp b/config.tests/arch/arch.cpp
index 44ec7214320..334e48f30d3 100644
--- a/config.tests/arch/arch.cpp
+++ b/config.tests/arch/arch.cpp
@@ -239,7 +239,7 @@ const char msg2[] = "==Qt=magic=Qt== Sub-architecture:"
 #endif
 
 // -- ARM --
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 " neon"
 #endif
 #ifdef __IWMMXT__
diff --git a/src/corelib/global/qsimd.cpp b/src/corelib/global/qsimd.cpp
index 2bd1aed573e..1d3214b273e 100644
--- a/src/corelib/global/qsimd.cpp
+++ b/src/corelib/global/qsimd.cpp
@@ -151,7 +151,7 @@ static inline quint64 detectProcessorFeatures()
         features |= CpuFeatureAES;
     return features;
 #endif
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#if defined(__ARM_NEON__)
     features |= CpuFeatureNEON;
 #endif
 #if defined(__ARM_FEATURE_CRC32)
diff --git a/src/corelib/global/qsimd.h b/src/corelib/global/qsimd.h
index 4ef925ca337..8ee3e9b15d8 100644
--- a/src/corelib/global/qsimd.h
+++ b/src/corelib/global/qsimd.h
@@ -34,7 +34,7 @@
 
 #define QT_COMPILER_USES(feature) (1/QT_COMPILER_USES_##feature == 1)
 
-#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 #  include <arm_neon.h>
 #  define QT_COMPILER_USES_neon 1
 #else
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h
index 012eb6cf4f4..b9cd296c9e8 100644
--- a/src/corelib/global/qsimd_p.h
+++ b/src/corelib/global/qsimd_p.h
@@ -257,7 +257,7 @@ static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features a
 
 // NEON intrinsics
 // note: as of GCC 4.9, does not support function targets for ARM
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 #if defined(Q_CC_CLANG)
 #define QT_FUNCTION_TARGET_STRING_NEON      "neon"
 #else
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 9a09675eb7e..c4c4c528ccd 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -749,7 +749,11 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
                                    [=](qsizetype i) { return n + i; });
 #  endif
 #elif defined(__ARM_NEON__)
+#ifdef _MSC_VER
+    const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
+#else
     const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
     const uint16x8_t ch_vec = vdupq_n_u16(c);
     for (const char16_t *next = n + 8; next <= e; n = next, next += 8) {
         uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n));
@@ -1290,7 +1294,11 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
 #  elif defined(__ARM_NEON__)
     if (l >= 8) {
         const char16_t *end = a + l;
+#ifdef _MSC_VER
+        const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
+#else
         const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
         while (end - a > 7) {
             uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a));
             uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b));
diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 9fc3318d716..57e0e30d9c8 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -351,7 +351,12 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const
 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
 {
     uint16x8_t maxAscii = vdupq_n_u16(0x7f);
-    uint16x8_t mask1 = { 1,      1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+#ifdef _MSC_VER
+    uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+    uint16x8_t mask1 = vld1q_u16(mask1t);
+#else
+    uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+#endif
     uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
 
     // do sixteen characters at a time
@@ -389,7 +394,12 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
 {
     // do eight characters at a time
     uint8x8_t msb_mask = vdup_n_u8(0x80);
+#ifdef _MSC_VER
+    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+    uint8x8_t add_mask = vld1_u8(add_maskt);
+#else
     uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
     for ( ; end - src >= 8; src += 8, dst += 8) {
         uint8x8_t c = vld1_u8(src);
         uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
@@ -425,7 +435,12 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
 
     // do eight characters at a time
     uint8x8_t msb_mask = vdup_n_u8(0x80);
+#ifdef _MSC_VER
+    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+    uint8x8_t add_mask = vld1_u8(add_maskt);
+#else
     uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
     for ( ; end - src >= 8; src += 8) {
         uint8x8_t c = vld1_u8(src);
         uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp
index 52fbebfa5db..14200ccf9d5 100644
--- a/src/gui/painting/qcolortransform.cpp
+++ b/src/gui/painting/qcolortransform.cpp
@@ -662,18 +662,27 @@ static inline bool test_all_zero(uint32x4_t p)
 #endif
 }
 
+static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+#ifdef _MSC_VER
+    return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
+#else
+    return uint32x4_t{ a, b, c, d };
+#endif
+}
+
 template<typename T>
 static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
 {
     constexpr bool isARGB = isArgb<T>();
     const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = {
-        isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
-              : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
-        d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
-        isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
-              : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
-        QColorTrcLut::Resolution };
+    const uint32x4_t vRangeMax = vsetq_u32(
+            isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
+            d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
+            isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
+            QColorTrcLut::Resolution);
     for (qsizetype i = 0; i < len; ++i) {
         uint32x4_t v;
         loadP<T>(src[i], v);
@@ -740,13 +749,13 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len
 {
     constexpr bool isARGB = isArgb<T>();
     const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = {
-        isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
-               : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
-        d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
-        isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
-               : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
-        QColorTrcLut::Resolution };
+    const uint32x4_t vRangeMax = vsetq_u32(
+            isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
+            d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
+            isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
+            QColorTrcLut::Resolution);
     for (qsizetype i = 0; i < len; ++i) {
         uint32x4_t v;
         loadPU<T>(src[i], v);
diff --git a/src/gui/painting/qcolortrclut_p.h b/src/gui/painting/qcolortrclut_p.h
index 220b504a83e..da5e9e60793 100644
--- a/src/gui/painting/qcolortrclut_p.h
+++ b/src/gui/painting/qcolortrclut_p.h
@@ -18,13 +18,14 @@
 #include <QtGui/private/qtguiglobal_p.h>
 #include <QtGui/qrgb.h>
 #include <QtGui/qrgba64.h>
+#include <QtCore/private/qsimd_p.h>
 
 #include <cmath>
 #include <memory>
 
 #if defined(__SSE2__)
 #include <emmintrin.h>
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#elif defined(__ARM_NEON__)
 #include <arm_neon.h>
 #endif
 
@@ -75,7 +76,7 @@ public:
         QRgba64 rgba64;
         _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
         return rgba64;
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32));
         uint16x4_t v16 = vget_low_u16(vmovl_u8(v8));
         const uint16x4_t vidx = vshl_n_u16(v16, ShiftUp);
@@ -145,7 +146,7 @@ public:
         v = _mm_srli_epi16(v, 8);
         v = _mm_packus_epi16(v, v);
         return _mm_cvtsi128_si32(v);
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
         v = vsub_u16(v, vshr_n_u16(v, 8));
         const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);
@@ -236,7 +237,7 @@ private:
         QRgba64 rgba64;
         _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
         return rgba64;
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
         v = vsub_u16(v, vshr_n_u16(v, 8));
         const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index 1fceb83710b..f8de0a85116 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1008,8 +1008,18 @@ void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
 class QSimdNeon
 {
 public:
-    typedef int32x4_t Int32x4;
-    typedef float32x4_t Float32x4;
+    struct Int32x4 {
+        Int32x4() = default;
+        Int32x4(int32x4_t v) : v(v) {}
+        int32x4_t v;
+        operator int32x4_t() const { return v; }
+    };
+    struct Float32x4 {
+        Float32x4() = default;
+        Float32x4(float32x4_t v) : v(v) {};
+        float32x4_t v;
+        operator float32x4_t() const { return v; }
+    };
 
     union Vect_buffer_i { Int32x4 v; int i[4]; };
     union Vect_buffer_f { Float32x4 v; float f[4]; };
@@ -1059,7 +1069,9 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
 static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
 {
-#if defined(Q_PROCESSOR_ARM_64)
+#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
+    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
+#elif defined(Q_PROCESSOR_ARM_64)
     const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
 #else
     const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
@@ -1079,7 +1091,11 @@ template<bool RGBA>
 static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
 {
     int i = 0;
+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
+#else
     const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+#endif
     const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
 
     for (; i < count - 3; i += 4) {
@@ -1131,7 +1147,11 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src
     if (count <= 0)
         return;
 
+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
+#else
     const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+#endif
     const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
 
     int i = 0;
diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp
index 4f2f0ae13a0..31646d2f23d 100644
--- a/src/gui/painting/qpixellayout.cpp
+++ b/src/gui/painting/qpixellayout.cpp
@@ -1087,7 +1087,9 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
         return;
 
     const uint32x4_t amask = vdupq_n_u32(0xff000000);
-#if defined(Q_PROCESSOR_ARM_64)
+#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
+    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
+#elif defined(Q_PROCESSOR_ARM_64)
     const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
 #else
     const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 6fd7d7ff2c1..6809f1d52cf 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -232,7 +232,11 @@ static inline uint toArgb32(QRgba64 rgba64)
 #elif defined __ARM_NEON__
     uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0706010003020504ULL };
+#else
     const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
+#endif
     v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
 #else
     v = vext_u16(v, v, 3);
author	Allan Sandfeld Jensen <[email protected]>	2024-08-07 14:33:51 +0200
committer	Allan Sandfeld Jensen <[email protected]>	2024-08-12 12:10:08 +0200
commit	572aa7caa04e85cbc07b18e1d0c720038facbf83 (patch)
tree	da05d6586f086f0e30b763b75b24acf8b3fb0d78
parent	e8b1566f0e919d87e58e0fe65607e2d498fb379c (diff)