diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index e9e54470e667..4c4863f6de06 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -581,6 +581,43 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_SSE42_CRC32_INTRINSICS +# PGAC_AVX512_PCLMUL_INTRINSICS +# --------------------------- +# Check if the compiler supports AVX-512 carryless multiplication +# and AVX-512VL instructions used for computing CRC. AVX-512F is +# assumed to be supported if the above are. +# +# If the intrinsics are supported, sets pgac_avx512_pclmul_intrinsics. +AC_DEFUN([PGAC_AVX512_PCLMUL_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_pclmul_intrinsics])])dnl +AC_CACHE_CHECK([for _mm512_clmulepi64_epi128], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + __m512i x; + __m512i y; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("vpclmulqdq,avx512vl"))) + #endif + static int avx512_pclmul_test(void) + { + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); + }], + [return avx512_pclmul_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_avx512_pclmul_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_AVX512_PCLMUL_INTRINSICS # PGAC_ARMV8_CRC32C_INTRINSICS # ---------------------------- diff --git a/configure b/configure index 11615d1122de..752b8d1533f3 100755 --- a/configure +++ b/configure @@ -17864,17 +17864,21 @@ fi # Select CRC-32C implementation. # -# If we are targeting a processor that has Intel SSE 4.2 instructions, we can -# use the special CRC instructions for calculating CRC-32C. If we're not -# targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# There are three methods of calculating CRC, in order of increasing +# performance: # -# Similarly, if we are targeting an ARM processor that has the CRC -# instructions that are part of the ARMv8 CRC Extension, use them. And if -# we're not targeting such a processor, but can nevertheless produce code that -# uses the CRC instructions, compile both, and select at runtime. +# 1. The fallback using a lookup table, called slicing-by-8 +# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension) +# 3. Algorithms using carryless multiplication instructions +# (e.g. Intel PCLMUL and Arm PMULL) +# +# If we can produce code (via function attributes or additional compiler +# flags) that uses #2 (and possibly #3), we compile all implementations +# and select which one to use at runtime, depending on what is supported +# by the processor we're running on. +# +# If we are targeting a processor that has #2, we can use that without +# runtime selection. # # Note that we do not use __attribute__((target("..."))) for the ARM CRC # instructions because until clang 16, using the ARM intrinsics still requires @@ -17925,7 +17929,7 @@ if test x"$USE_SSE42_CRC32C" = x"1"; then $as_echo "#define USE_SSE42_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5 $as_echo "SSE 4.2" >&6; } else @@ -17974,6 +17978,71 @@ $as_echo "slicing-by-8" >&6; } fi +# Check for carryless multiplication intrinsics to do vectorized CRC calculations. +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_clmulepi64_epi128" >&5 +$as_echo_n "checking for _mm512_clmulepi64_epi128... " >&6; } +if ${pgac_cv_avx512_pclmul_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + __m512i x; + __m512i y; + + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("vpclmulqdq,avx512vl"))) + #endif + static int avx512_pclmul_test(void) + { + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); + } +int +main () +{ +return avx512_pclmul_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_avx512_pclmul_intrinsics=yes +else + pgac_cv_avx512_pclmul_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_pclmul_intrinsics" >&5 +$as_echo "$pgac_cv_avx512_pclmul_intrinsics" >&6; } +if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then + pgac_avx512_pclmul_intrinsics=yes +fi + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5 +$as_echo_n "checking for vectorized CRC-32C... " >&6; } +if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then + +$as_echo "#define USE_AVX512_CRC_WITH_RUNTIME_CHECK 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5 +$as_echo "AVX-512 with runtime check" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 +$as_echo "none" >&6; } +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/configure.ac b/configure.ac index debdf1650441..f4af238e891c 100644 --- a/configure.ac +++ b/configure.ac @@ -2116,17 +2116,21 @@ AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. # -# If we are targeting a processor that has Intel SSE 4.2 instructions, we can -# use the special CRC instructions for calculating CRC-32C. If we're not -# targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. -# -# Similarly, if we are targeting an ARM processor that has the CRC -# instructions that are part of the ARMv8 CRC Extension, use them. And if -# we're not targeting such a processor, but can nevertheless produce code that -# uses the CRC instructions, compile both, and select at runtime. +# There are three methods of calculating CRC, in order of increasing +# performance: +# +# 1. The fallback using a lookup table, called slicing-by-8 +# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension) +# 3. Algorithms using carryless multiplication instructions +# (e.g. Intel PCLMUL and Arm PMULL) +# +# If we can produce code (via function attributes or additional compiler +# flags) that uses #2 (and possibly #3), we compile all implementations +# and select which one to use at runtime, depending on what is supported +# by the processor we're running on. +# +# If we are targeting a processor that has #2, we can use that without +# runtime selection. # # Note that we do not use __attribute__((target("..."))) for the ARM CRC # instructions because until clang 16, using the ARM intrinsics still requires @@ -2174,7 +2178,7 @@ fi AC_MSG_CHECKING([which CRC-32C implementation to use]) if test x"$USE_SSE42_CRC32C" = x"1"; then AC_DEFINE(USE_SSE42_CRC32C, 1, [Define to 1 use Intel SSE 4.2 CRC instructions.]) - PG_CRC32C_OBJS="pg_crc32c_sse42.o" + PG_CRC32C_OBJS="pg_crc32c_sse42.o pg_crc32c_sse42_choose.o" AC_MSG_RESULT(SSE 4.2) else if test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then @@ -2207,6 +2211,19 @@ else fi AC_SUBST(PG_CRC32C_OBJS) +# Check for carryless multiplication intrinsics to do vectorized CRC calculations. +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_AVX512_PCLMUL_INTRINSICS() +fi + +AC_MSG_CHECKING([for vectorized CRC-32C]) +if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then + AC_DEFINE(USE_AVX512_CRC_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.]) + AC_MSG_RESULT(AVX-512 with runtime check) +else + AC_MSG_RESULT(none) +fi # Select semaphore implementation type. if test "$PORTNAME" != "win32"; then diff --git a/meson.build b/meson.build index 454ed81f5ead..5f346032cb09 100644 --- a/meson.build +++ b/meson.build @@ -2349,17 +2349,21 @@ endif ############################################################### # Select CRC-32C implementation. # -# If we are targeting a processor that has Intel SSE 4.2 instructions, we can -# use the special CRC instructions for calculating CRC-32C. If we're not -# targeting such a processor, but we can nevertheless produce code that uses -# the SSE intrinsics, compile both implementations and select which one to use -# at runtime, depending on whether SSE 4.2 is supported by the processor we're -# running on. +# There are three methods of calculating CRC, in order of increasing +# performance: # -# Similarly, if we are targeting an ARM processor that has the CRC -# instructions that are part of the ARMv8 CRC Extension, use them. And if -# we're not targeting such a processor, but can nevertheless produce code that -# uses the CRC instructions, compile both, and select at runtime. +# 1. The fallback using a lookup table, called slicing-by-8 +# 2. CRC-32C instructions (found in e.g. Intel SSE 4.2 and ARMv8 CRC Extension) +# 3. Algorithms using carryless multiplication instructions +# (e.g. Intel PCLMUL and Arm PMULL) +# +# If we can produce code (via function attributes or additional compiler +# flags) that uses #2 (and possibly #3), we compile all implementations +# and select which one to use at runtime, depending on what is supported +# by the processor we're running on. +# +# If we are targeting a processor that has #2, we can use that without +# runtime selection. # # Note that we do not use __attribute__((target("..."))) for the ARM CRC # instructions because until clang 16, using the ARM intrinsics still requires @@ -2393,7 +2397,7 @@ int main(void) } ''' - if not cc.links(prog, name: '_mm_crc32_u8 and _mm_crc32_u32', + if not cc.links(prog, name: 'SSE 4.2 CRC32C', args: test_c_args) # Do not use Intel SSE 4.2 elif (cc.get_define('__SSE4_2__') != '') @@ -2408,6 +2412,38 @@ int main(void) have_optimized_crc = true endif + # Check if the compiler supports AVX-512 carryless multiplication + # and AVX-512VL instructions used for computing CRC. AVX-512F is + # assumed to be supported if the above are. + prog = ''' +#include +__m512i x; +__m512i y; + +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("vpclmulqdq,avx512vl"))) +#endif +int main(void) +{ + __m128i z; + + y = _mm512_clmulepi64_epi128(x, y, 0); + z = _mm_ternarylogic_epi64( + _mm512_castsi512_si128(y), + _mm512_extracti32x4_epi32(y, 1), + _mm512_extracti32x4_epi32(y, 2), + 0x96); + /* return computed value, to prevent the above being optimized away */ + return _mm_crc32_u64(0, _mm_extract_epi64(z, 0)); +} +''' + + if cc.links(prog, + name: 'AVX-512 CRC32C', + args: test_c_args) + cdata.set('USE_AVX512_CRC32C_WITH_RUNTIME_CHECK', 1) + endif + endif elif host_cpu == 'arm' or host_cpu == 'aarch64' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c2f1241b2342..f4152a6d75b0 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -665,6 +665,9 @@ /* Define to 1 to build with assertion checks. (--enable-cassert) */ #undef USE_ASSERT_CHECKING +/* Define to 1 to use AVX-512 CRC algorithms with a runtime check. */ +#undef USE_AVX512_CRC_WITH_RUNTIME_CHECK + /* Define to 1 to use AVX-512 popcount instructions with a runtime check. */ #undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 9376d223feff..82313bb7fcfe 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -42,7 +42,10 @@ typedef uint32 pg_crc32c; #define EQ_CRC32C(c1, c2) ((c1) == (c2)) #if defined(USE_SSE42_CRC32C) -/* Use Intel SSE4.2 instructions. */ +/* + * Use either Intel SSE 4.2 or AVX-512 instructions. We don't need a runtime check + * for SSE 4.2, so we can inline those in some cases. + */ #include @@ -50,7 +53,11 @@ typedef uint32 pg_crc32c; ((crc) = pg_comp_crc32c_dispatch((crc), (data), (len))) #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) +extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len); +#endif /* * We can only get here if the host compiler targets SSE 4.2, but on some @@ -82,9 +89,27 @@ pg_comp_crc32c_dispatch(pg_crc32c crc, const void *data, size_t len) return crc; } else - return pg_comp_crc32c_sse42(crc, data, len); + /* Otherwise, use a runtime check for AVX-512 instructions. */ + return pg_comp_crc32c(crc, data, len); } +#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) + +/* + * Use Intel SSE 4.2 or AVX-512 instructions, but perform a runtime check first + * to check that they are available. + */ +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); +extern PGDLLIMPORT pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); +extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t len); +#endif + #elif defined(USE_ARMV8_CRC32C) /* Use ARMv8 CRC Extension instructions. */ @@ -103,10 +128,10 @@ extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); -#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) +#elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* - * Use Intel SSE 4.2 or ARMv8 instructions, but perform a runtime check first + * Use ARMv8 instructions, but perform a runtime check first * to check that they are available. */ #define COMP_CRC32C(crc, data, len) \ @@ -115,13 +140,7 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_ extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len); extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len); - -#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK -extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len); -#endif -#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); -#endif #else /* diff --git a/src/port/meson.build b/src/port/meson.build index 51041e756099..48d2dfb7cf3d 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -86,6 +86,7 @@ replace_funcs_pos = [ # x86/x64 ['pg_crc32c_sse42', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], + ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C'], ['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'], diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c index 22c2137df31c..e2b1246f01d3 100644 --- a/src/port/pg_crc32c_sse42.c +++ b/src/port/pg_crc32c_sse42.c @@ -1,7 +1,7 @@ /*------------------------------------------------------------------------- * * pg_crc32c_sse42.c - * Compute CRC-32C checksum using Intel SSE 4.2 instructions. + * Compute CRC-32C checksum using Intel SSE 4.2 or AVX-512 instructions. * * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -15,9 +15,14 @@ #include "c.h" #include +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK +#include +#endif #include "port/pg_crc32c.h" +#define DEBUG_CRC /* XXX not for commit, or at least comment out */ + pg_attribute_no_sanitize_alignment() pg_attribute_target("sse4.2") pg_crc32c @@ -68,3 +73,101 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len) return crc; } + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + +/* + * Note: There is no copyright notice in the following generated code. + * + * We have modified the output to + * - match our function declaration + * - make whitespace match project style + * - add a threshold for the alignment stanza + */ + +/* Generated by https://github.com/corsix/fast-crc32/ using: */ +/* ./generate -i avx512_vpclmulqdq -p crc32c -a v1e */ +/* MIT licensed */ + +#define clmul_lo(a, b) (_mm512_clmulepi64_epi128((a), (b), 0)) +#define clmul_hi(a, b) (_mm512_clmulepi64_epi128((a), (b), 17)) + +pg_attribute_target("vpclmulqdq,avx512vl") +pg_crc32c +pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t length) +{ + /* adjust names to match generated code */ + pg_crc32c crc0 = crc; + size_t len = length; + const char *buf = data; +#ifdef DEBUG_CRC + const size_t orig_len PG_USED_FOR_ASSERTS_ONLY = len; +#endif + + /* Align on cacheline boundary. The threshold is somewhat arbitrary. */ + if (unlikely(len > 256)) + { + for (; len && ((uintptr_t) buf & 7); --len) + crc0 = _mm_crc32_u8(crc0, *buf++); + while (((uintptr_t) buf & 56) && len >= 8) + { + crc0 = _mm_crc32_u64(crc0, *(const uint64_t *) buf); + buf += 8; + len -= 8; + } + } + + if (len >= 64) + { + const char *end = buf + len; + const char *limit = buf + len - 64; + __m128i z0; + + /* First vector chunk. */ + __m512i x0 = _mm512_loadu_si512((const void *) buf), + y0; + __m512i k; + + k = _mm512_broadcast_i32x4(_mm_setr_epi32(0x740eef02, 0, 0x9e4addf8, 0)); + x0 = _mm512_xor_si512(_mm512_castsi128_si512(_mm_cvtsi32_si128(crc0)), x0); + buf += 64; + + /* Main loop. */ + while (buf <= limit) + { + y0 = clmul_lo(x0, k), x0 = clmul_hi(x0, k); + x0 = _mm512_ternarylogic_epi64(x0, y0, + _mm512_loadu_si512((const void *) buf), + 0x96); + buf += 64; + } + + /* Reduce 512 bits to 128 bits. */ + k = _mm512_setr_epi32(0x1c291d04, 0, 0xddc0152b, 0, + 0x3da6d0cb, 0, 0xba4fc28e, 0, + 0xf20c0dfe, 0, 0x493c7d27, 0, + 0, 0, 0, 0); + y0 = clmul_lo(x0, k), k = clmul_hi(x0, k); + y0 = _mm512_xor_si512(y0, k); + z0 = _mm_ternarylogic_epi64(_mm512_castsi512_si128(y0), + _mm512_extracti32x4_epi32(y0, 1), + _mm512_extracti32x4_epi32(y0, 2), + 0x96); + z0 = _mm_xor_si128(z0, _mm512_extracti32x4_epi32(x0, 3)); + + /* Reduce 128 bits to 32 bits, and multiply by x^32. */ + crc0 = _mm_crc32_u64(0, _mm_extract_epi64(z0, 0)); + crc0 = _mm_crc32_u64(crc0, _mm_extract_epi64(z0, 1)); + len = end - buf; + } + + crc0 = pg_comp_crc32c_sse42(crc0, buf, len); + +#ifdef DEBUG_CRC + Assert(crc0 == pg_comp_crc32c_sse42(crc, data, orig_len)); +#endif + + return crc0; +} + +#endif diff --git a/src/port/pg_crc32c_sse42_choose.c b/src/port/pg_crc32c_sse42_choose.c index 65dbc4d42497..74d2421ba2be 100644 --- a/src/port/pg_crc32c_sse42_choose.c +++ b/src/port/pg_crc32c_sse42_choose.c @@ -20,30 +20,37 @@ #include "c.h" -#ifdef HAVE__GET_CPUID +#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include #endif -#ifdef HAVE__CPUID +#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX) #include #endif +#ifdef HAVE_XSAVE_INTRINSICS +#include +#endif + #include "port/pg_crc32c.h" +/* + * Does XGETBV say the ZMM registers are enabled? + * + * NB: Caller is responsible for verifying that osxsave is available + * before calling this. + */ +#ifdef HAVE_XSAVE_INTRINSICS +pg_attribute_target("xsave") +#endif static bool -pg_crc32c_sse42_available(void) +zmm_regs_available(void) { - unsigned int exx[4] = {0, 0, 0, 0}; - -#if defined(HAVE__GET_CPUID) - __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); -#elif defined(HAVE__CPUID) - __cpuid(exx, 1); +#ifdef HAVE_XSAVE_INTRINSICS + return (_xgetbv(0) & 0xe6) == 0xe6; #else -#error cpuid instruction not available + return false; #endif - - return (exx[2] & (1 << 20)) != 0; /* SSE 4.2 */ } /* @@ -53,10 +60,48 @@ pg_crc32c_sse42_available(void) static pg_crc32c pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len) { - if (pg_crc32c_sse42_available()) + unsigned int exx[4] = {0, 0, 0, 0}; + + /* + * Set fallback. We must guard since slicing-by-8 is not visible + * everywhere. + */ +#ifdef USE_SSE42_CRC32C_WITH_RUNTIME_CHECK + pg_comp_crc32c = pg_comp_crc32c_sb8; +#endif + +#if defined(HAVE__GET_CPUID) + __get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUID) + __cpuid(exx, 1); +#else +#error cpuid instruction not available +#endif + + if ((exx[2] & (1 << 20)) != 0) /* SSE 4.2 */ + { pg_comp_crc32c = pg_comp_crc32c_sse42; - else - pg_comp_crc32c = pg_comp_crc32c_sb8; + + if (exx[2] & (1 << 27) && /* OSXSAVE */ + zmm_regs_available()) + { + /* second cpuid call on leaf 7 to check extended AVX-512 support */ + + memset(exx, 0, 4 * sizeof(exx[0])); + +#if defined(HAVE__GET_CPUID_COUNT) + __get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]); +#elif defined(HAVE__CPUIDEX) + __cpuidex(exx, 7, 0); +#endif + +#ifdef USE_AVX512_CRC32C_WITH_RUNTIME_CHECK + if (exx[2] & (1 << 10) && /* VPCLMULQDQ */ + exx[1] & (1 << 31)) /* AVX512-VL */ + pg_comp_crc32c = pg_comp_crc32c_avx512; +#endif + } + } return pg_comp_crc32c(crc, data, len); } diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index dc485735aa45..174f0a68331b 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -2330,6 +2330,30 @@ SELECT crc32c('The quick brown fox jumps over the lazy dog.'); 419469235 (1 row) +SELECT crc32c(repeat('A', 127)::bytea); + crc32c +----------- + 291820082 +(1 row) + +SELECT crc32c(repeat('A', 128)::bytea); + crc32c +----------- + 816091258 +(1 row) + +SELECT crc32c(repeat('A', 129)::bytea); + crc32c +------------ + 4213642571 +(1 row) + +SELECT crc32c(repeat('A', 800)::bytea); + crc32c +------------ + 3134039419 +(1 row) + -- -- encode/decode -- diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index aeba798dac1f..f7b325baadf4 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -738,6 +738,11 @@ SELECT crc32('The quick brown fox jumps over the lazy dog.'); SELECT crc32c(''); SELECT crc32c('The quick brown fox jumps over the lazy dog.'); +SELECT crc32c(repeat('A', 127)::bytea); +SELECT crc32c(repeat('A', 128)::bytea); +SELECT crc32c(repeat('A', 129)::bytea); +SELECT crc32c(repeat('A', 800)::bytea); + -- -- encode/decode --