Optimize pg_popcount() with AVX-512 instructions.
Presently, pg_popcount() processes data in 32-bit or 64-bit chunks when possible. Newer hardware that supports AVX-512 instructions can use 512-bit chunks, which provides a nice speedup, especially for larger buffers. This commit introduces the infrastructure required to detect compiler and CPU support for the required AVX-512 intrinsic functions, and it adds a new pg_popcount() implementation that uses these functions. If CPU support for this optimized implementation is detected at runtime, a function pointer is updated so that it is used by subsequent calls to pg_popcount(). Most of the existing in-tree calls to pg_popcount() should benefit from these instructions, and calls with smaller buffers should at least not regress compared to v16. The new infrastructure introduced by this commit can also be used to optimize visibilitymap_count(), but that is left for a follow-up commit. Co-authored-by: Paul Amonson, Ants Aasma Reviewed-by: Matthias van de Meent, Tom Lane, Noah Misch, Akash Shankaran, Alvaro Herrera, Andres Freund, David Rowley Discussion: https://postgr.es/m/BL1PR11MB5304097DF7EA81D04C33F3D1DCA6A%40BL1PR11MB5304.namprd11.prod.outlook.com
This commit is contained in:
parent
158f581923
commit
792752af4e
15 changed files with 696 additions and 3 deletions
|
@ -694,3 +694,61 @@ if test x"$Ac_cachevar" = x"yes"; then
|
|||
fi
|
||||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_LOONGARCH_CRC32C_INTRINSICS
|
||||
|
||||
# PGAC_XSAVE_INTRINSICS
|
||||
# ---------------------
|
||||
# Check if the compiler supports the XSAVE instructions using the _xgetbv
|
||||
# intrinsic function.
|
||||
#
|
||||
# An optional compiler flag can be passed as argument (e.g., -mxsave). If the
|
||||
# intrinsic is supported, sets pgac_xsave_intrinsics and CFLAGS_XSAVE.
|
||||
AC_DEFUN([PGAC_XSAVE_INTRINSICS],
|
||||
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_xsave_intrinsics_$1])])dnl
|
||||
AC_CACHE_CHECK([for _xgetbv with CFLAGS=$1], [Ac_cachevar],
|
||||
[pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS $1"
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
|
||||
[return _xgetbv(0) & 0xe0;])],
|
||||
[Ac_cachevar=yes],
|
||||
[Ac_cachevar=no])
|
||||
CFLAGS="$pgac_save_CFLAGS"])
|
||||
if test x"$Ac_cachevar" = x"yes"; then
|
||||
CFLAGS_XSAVE="$1"
|
||||
pgac_xsave_intrinsics=yes
|
||||
fi
|
||||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_XSAVE_INTRINSICS
|
||||
|
||||
# PGAC_AVX512_POPCNT_INTRINSICS
|
||||
# -----------------------------
|
||||
# Check if the compiler supports the AVX-512 popcount instructions using the
|
||||
# _mm512_setzero_si512, _mm512_maskz_loadu_epi8, _mm512_popcnt_epi64,
|
||||
# _mm512_add_epi64, and _mm512_reduce_add_epi64 intrinsic functions.
|
||||
#
|
||||
# Optional compiler flags can be passed as argument (e.g., -mavx512vpopcntdq
|
||||
# -mavx512bw). If the intrinsics are supported, sets
|
||||
# pgac_avx512_popcnt_intrinsics and CFLAGS_POPCNT.
|
||||
AC_DEFUN([PGAC_AVX512_POPCNT_INTRINSICS],
|
||||
[define([Ac_cachevar], [AS_TR_SH([pgac_cv_avx512_popcnt_intrinsics_$1])])dnl
|
||||
AC_CACHE_CHECK([for _mm512_popcnt_epi64 with CFLAGS=$1], [Ac_cachevar],
|
||||
[pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS $1"
|
||||
AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <immintrin.h>],
|
||||
[const char buf@<:@sizeof(__m512i)@:>@;
|
||||
PG_INT64_TYPE popcnt = 0;
|
||||
__m512i accum = _mm512_setzero_si512();
|
||||
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
|
||||
const __m512i cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
popcnt = _mm512_reduce_add_epi64(accum);
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return popcnt == 0;])],
|
||||
[Ac_cachevar=yes],
|
||||
[Ac_cachevar=no])
|
||||
CFLAGS="$pgac_save_CFLAGS"])
|
||||
if test x"$Ac_cachevar" = x"yes"; then
|
||||
CFLAGS_POPCNT="$1"
|
||||
pgac_avx512_popcnt_intrinsics=yes
|
||||
fi
|
||||
undefine([Ac_cachevar])dnl
|
||||
])# PGAC_AVX512_POPCNT_INTRINSICS
|
||||
|
|
252
configure
vendored
252
configure
vendored
|
@ -647,6 +647,9 @@ MSGFMT_FLAGS
|
|||
MSGFMT
|
||||
PG_CRC32C_OBJS
|
||||
CFLAGS_CRC
|
||||
PG_POPCNT_OBJS
|
||||
CFLAGS_POPCNT
|
||||
CFLAGS_XSAVE
|
||||
LIBOBJS
|
||||
OPENSSL
|
||||
ZSTD
|
||||
|
@ -17404,6 +17407,40 @@ $as_echo "#define HAVE__GET_CPUID 1" >>confdefs.h
|
|||
|
||||
fi
|
||||
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __get_cpuid_count" >&5
|
||||
$as_echo_n "checking for __get_cpuid_count... " >&6; }
|
||||
if ${pgac_cv__get_cpuid_count+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <cpuid.h>
|
||||
int
|
||||
main ()
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv__get_cpuid_count="yes"
|
||||
else
|
||||
pgac_cv__get_cpuid_count="no"
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__get_cpuid_count" >&5
|
||||
$as_echo "$pgac_cv__get_cpuid_count" >&6; }
|
||||
if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
|
||||
|
||||
$as_echo "#define HAVE__GET_CPUID_COUNT 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuid" >&5
|
||||
$as_echo_n "checking for __cpuid... " >&6; }
|
||||
if ${pgac_cv__cpuid+:} false; then :
|
||||
|
@ -17438,6 +17475,221 @@ $as_echo "#define HAVE__CPUID 1" >>confdefs.h
|
|||
|
||||
fi
|
||||
|
||||
# Probe for the MSVC-style __cpuidex() intrinsic from <intrin.h>.  It takes
# the four-element output array, then the CPUID leaf and subleaf.  The test
# program must call that exact function; probing for a misspelled name can
# never link, which would leave HAVE__CPUIDEX permanently undefined.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __cpuidex" >&5
$as_echo_n "checking for __cpuidex... " >&6; }
if ${pgac_cv__cpuidex+:} false; then :
  $as_echo_n "(cached) " >&6
else
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
#include <intrin.h>
int
main ()
{
unsigned int exx[4] = {0, 0, 0, 0};
  __cpuidex(exx, 7, 0);

  ;
  return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
  pgac_cv__cpuidex="yes"
else
  pgac_cv__cpuidex="no"
fi
rm -f core conftest.err conftest.$ac_objext \
    conftest$ac_exeext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__cpuidex" >&5
$as_echo "$pgac_cv__cpuidex" >&6; }
if test x"$pgac_cv__cpuidex" = x"yes"; then

$as_echo "#define HAVE__CPUIDEX 1" >>confdefs.h

fi
|
||||
|
||||
# Check for XSAVE intrinsics
|
||||
#
|
||||
CFLAGS_XSAVE=""
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=" >&5
|
||||
$as_echo_n "checking for _xgetbv with CFLAGS=... " >&6; }
|
||||
if ${pgac_cv_xsave_intrinsics_+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS "
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <immintrin.h>
|
||||
int
|
||||
main ()
|
||||
{
|
||||
return _xgetbv(0) & 0xe0;
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv_xsave_intrinsics_=yes
|
||||
else
|
||||
pgac_cv_xsave_intrinsics_=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
CFLAGS="$pgac_save_CFLAGS"
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics_" >&5
|
||||
$as_echo "$pgac_cv_xsave_intrinsics_" >&6; }
|
||||
if test x"$pgac_cv_xsave_intrinsics_" = x"yes"; then
|
||||
CFLAGS_XSAVE=""
|
||||
pgac_xsave_intrinsics=yes
|
||||
fi
|
||||
|
||||
if test x"$pgac_xsave_intrinsics" != x"yes"; then
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _xgetbv with CFLAGS=-mxsave" >&5
|
||||
$as_echo_n "checking for _xgetbv with CFLAGS=-mxsave... " >&6; }
|
||||
if ${pgac_cv_xsave_intrinsics__mxsave+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS -mxsave"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <immintrin.h>
|
||||
int
|
||||
main ()
|
||||
{
|
||||
return _xgetbv(0) & 0xe0;
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv_xsave_intrinsics__mxsave=yes
|
||||
else
|
||||
pgac_cv_xsave_intrinsics__mxsave=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
CFLAGS="$pgac_save_CFLAGS"
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_xsave_intrinsics__mxsave" >&5
|
||||
$as_echo "$pgac_cv_xsave_intrinsics__mxsave" >&6; }
|
||||
if test x"$pgac_cv_xsave_intrinsics__mxsave" = x"yes"; then
|
||||
CFLAGS_XSAVE="-mxsave"
|
||||
pgac_xsave_intrinsics=yes
|
||||
fi
|
||||
|
||||
fi
|
||||
if test x"$pgac_xsave_intrinsics" = x"yes"; then
|
||||
|
||||
$as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
|
||||
|
||||
# Check for AVX-512 popcount intrinsics
|
||||
#
|
||||
CFLAGS_POPCNT=""
|
||||
PG_POPCNT_OBJS=""
|
||||
if test x"$host_cpu" = x"x86_64"; then
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=" >&5
|
||||
$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=... " >&6; }
|
||||
if ${pgac_cv_avx512_popcnt_intrinsics_+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS "
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <immintrin.h>
|
||||
int
|
||||
main ()
|
||||
{
|
||||
const char buf[sizeof(__m512i)];
|
||||
PG_INT64_TYPE popcnt = 0;
|
||||
__m512i accum = _mm512_setzero_si512();
|
||||
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
|
||||
const __m512i cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
popcnt = _mm512_reduce_add_epi64(accum);
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return popcnt == 0;
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv_avx512_popcnt_intrinsics_=yes
|
||||
else
|
||||
pgac_cv_avx512_popcnt_intrinsics_=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
CFLAGS="$pgac_save_CFLAGS"
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics_" >&5
|
||||
$as_echo "$pgac_cv_avx512_popcnt_intrinsics_" >&6; }
|
||||
if test x"$pgac_cv_avx512_popcnt_intrinsics_" = x"yes"; then
|
||||
CFLAGS_POPCNT=""
|
||||
pgac_avx512_popcnt_intrinsics=yes
|
||||
fi
|
||||
|
||||
if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw" >&5
|
||||
$as_echo_n "checking for _mm512_popcnt_epi64 with CFLAGS=-mavx512vpopcntdq -mavx512bw... " >&6; }
|
||||
if ${pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw+:} false; then :
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
pgac_save_CFLAGS=$CFLAGS
|
||||
CFLAGS="$pgac_save_CFLAGS -mavx512vpopcntdq -mavx512bw"
|
||||
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
|
||||
/* end confdefs.h. */
|
||||
#include <immintrin.h>
|
||||
int
|
||||
main ()
|
||||
{
|
||||
const char buf[sizeof(__m512i)];
|
||||
PG_INT64_TYPE popcnt = 0;
|
||||
__m512i accum = _mm512_setzero_si512();
|
||||
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
|
||||
const __m512i cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
popcnt = _mm512_reduce_add_epi64(accum);
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return popcnt == 0;
|
||||
;
|
||||
return 0;
|
||||
}
|
||||
_ACEOF
|
||||
if ac_fn_c_try_link "$LINENO"; then :
|
||||
pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=yes
|
||||
else
|
||||
pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw=no
|
||||
fi
|
||||
rm -f core conftest.err conftest.$ac_objext \
|
||||
conftest$ac_exeext conftest.$ac_ext
|
||||
CFLAGS="$pgac_save_CFLAGS"
|
||||
fi
|
||||
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&5
|
||||
$as_echo "$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" >&6; }
|
||||
if test x"$pgac_cv_avx512_popcnt_intrinsics__mavx512vpopcntdq__mavx512bw" = x"yes"; then
|
||||
CFLAGS_POPCNT="-mavx512vpopcntdq -mavx512bw"
|
||||
pgac_avx512_popcnt_intrinsics=yes
|
||||
fi
|
||||
|
||||
fi
|
||||
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
|
||||
PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
|
||||
|
||||
$as_echo "#define USE_AVX512_POPCNT_WITH_RUNTIME_CHECK 1" >>confdefs.h
|
||||
|
||||
fi
|
||||
fi
|
||||
|
||||
|
||||
|
||||
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
||||
#
|
||||
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
|
||||
|
|
51
configure.ac
51
configure.ac
|
@ -2052,6 +2052,17 @@ if test x"$pgac_cv__get_cpuid" = x"yes"; then
|
|||
AC_DEFINE(HAVE__GET_CPUID, 1, [Define to 1 if you have __get_cpuid.])
|
||||
fi
|
||||
|
||||
AC_CACHE_CHECK([for __get_cpuid_count], [pgac_cv__get_cpuid_count],
|
||||
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <cpuid.h>],
|
||||
[[unsigned int exx[4] = {0, 0, 0, 0};
|
||||
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
]])],
|
||||
[pgac_cv__get_cpuid_count="yes"],
|
||||
[pgac_cv__get_cpuid_count="no"])])
|
||||
if test x"$pgac_cv__get_cpuid_count" = x"yes"; then
|
||||
AC_DEFINE(HAVE__GET_CPUID_COUNT, 1, [Define to 1 if you have __get_cpuid_count.])
|
||||
fi
|
||||
|
||||
AC_CACHE_CHECK([for __cpuid], [pgac_cv__cpuid],
|
||||
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
|
||||
[[unsigned int exx[4] = {0, 0, 0, 0};
|
||||
|
@ -2063,6 +2074,46 @@ if test x"$pgac_cv__cpuid" = x"yes"; then
|
|||
AC_DEFINE(HAVE__CPUID, 1, [Define to 1 if you have __cpuid.])
|
||||
fi
|
||||
|
||||
# Probe for the MSVC-style __cpuidex() intrinsic from <intrin.h>.  It takes
# the four-element output array, then the CPUID leaf and subleaf.  The test
# program must call that exact function; probing for a misspelled name can
# never link, which would leave HAVE__CPUIDEX permanently undefined.
AC_CACHE_CHECK([for __cpuidex], [pgac_cv__cpuidex],
[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <intrin.h>],
  [[unsigned int exx[4] = {0, 0, 0, 0};
  __cpuidex(exx, 7, 0);
  ]])],
  [pgac_cv__cpuidex="yes"],
  [pgac_cv__cpuidex="no"])])
if test x"$pgac_cv__cpuidex" = x"yes"; then
  AC_DEFINE(HAVE__CPUIDEX, 1, [Define to 1 if you have __cpuidex.])
fi
|
||||
|
||||
# Check for XSAVE intrinsics
|
||||
#
|
||||
CFLAGS_XSAVE=""
|
||||
PGAC_XSAVE_INTRINSICS([])
|
||||
if test x"$pgac_xsave_intrinsics" != x"yes"; then
|
||||
PGAC_XSAVE_INTRINSICS([-mxsave])
|
||||
fi
|
||||
if test x"$pgac_xsave_intrinsics" = x"yes"; then
|
||||
AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.])
|
||||
fi
|
||||
AC_SUBST(CFLAGS_XSAVE)
|
||||
|
||||
# Check for AVX-512 popcount intrinsics
|
||||
#
|
||||
CFLAGS_POPCNT=""
|
||||
PG_POPCNT_OBJS=""
|
||||
if test x"$host_cpu" = x"x86_64"; then
|
||||
PGAC_AVX512_POPCNT_INTRINSICS([])
|
||||
if test x"$pgac_avx512_popcnt_intrinsics" != x"yes"; then
|
||||
PGAC_AVX512_POPCNT_INTRINSICS([-mavx512vpopcntdq -mavx512bw])
|
||||
fi
|
||||
if test x"$pgac_avx512_popcnt_intrinsics" = x"yes"; then
|
||||
PG_POPCNT_OBJS="pg_popcount_avx512.o pg_popcount_avx512_choose.o"
|
||||
AC_DEFINE(USE_AVX512_POPCNT_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 popcount instructions with a runtime check.])
|
||||
fi
|
||||
fi
|
||||
AC_SUBST(CFLAGS_POPCNT)
|
||||
AC_SUBST(PG_POPCNT_OBJS)
|
||||
|
||||
# Check for Intel SSE 4.2 intrinsics to do CRC calculations.
|
||||
#
|
||||
# First check if the _mm_crc32_u8 and _mm_crc32_u64 intrinsics can be used
|
||||
|
|
87
meson.build
87
meson.build
|
@ -1783,6 +1783,30 @@ elif cc.links('''
|
|||
endif
|
||||
|
||||
|
||||
# Check for __get_cpuid_count() and __cpuidex() in a similar fashion.
|
||||
if cc.links('''
|
||||
#include <cpuid.h>
|
||||
int main(int arg, char **argv)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
|
||||
}
|
||||
''', name: '__get_cpuid_count',
|
||||
args: test_c_args)
|
||||
cdata.set('HAVE__GET_CPUID_COUNT', 1)
|
||||
elif cc.links('''
|
||||
#include <intrin.h>
|
||||
int main(int arg, char **argv)
|
||||
{
|
||||
unsigned int exx[4] = {0, 0, 0, 0};
|
||||
__cpuidex(exx, 7, 0);
|
||||
}
|
||||
''', name: '__cpuidex',
|
||||
args: test_c_args)
|
||||
cdata.set('HAVE__CPUIDEX', 1)
|
||||
endif
|
||||
|
||||
|
||||
# Defend against clang being used on x86-32 without SSE2 enabled. As current
|
||||
# versions of clang do not understand -fexcess-precision=standard, the use of
|
||||
# x87 floating point operations leads to problems like isinf possibly returning
|
||||
|
@ -1996,6 +2020,69 @@ int main(void)
|
|||
endif
|
||||
|
||||
|
||||
###############################################################
|
||||
# Check for the availability of XSAVE intrinsics.
|
||||
###############################################################
|
||||
|
||||
cflags_xsave = []
|
||||
if host_cpu == 'x86' or host_cpu == 'x86_64'
|
||||
|
||||
prog = '''
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
return _xgetbv(0) & 0xe0;
|
||||
}
|
||||
'''
|
||||
|
||||
if cc.links(prog, name: 'XSAVE intrinsics without -mxsave',
|
||||
args: test_c_args)
|
||||
cdata.set('HAVE_XSAVE_INTRINSICS', 1)
|
||||
elif cc.links(prog, name: 'XSAVE intrinsics with -mxsave',
|
||||
args: test_c_args + ['-mxsave'])
|
||||
cdata.set('HAVE_XSAVE_INTRINSICS', 1)
|
||||
cflags_xsave += '-mxsave'
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
###############################################################
|
||||
# Check for the availability of AVX-512 popcount intrinsics.
|
||||
###############################################################
|
||||
|
||||
cflags_popcnt = []
|
||||
if host_cpu == 'x86_64'
|
||||
|
||||
prog = '''
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
const char buf[sizeof(__m512i)];
|
||||
INT64 popcnt = 0;
|
||||
__m512i accum = _mm512_setzero_si512();
|
||||
const __m512i val = _mm512_maskz_loadu_epi8((__mmask64) 0xf0f0f0f0f0f0f0f0, (const __m512i *) buf);
|
||||
const __m512i cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
popcnt = _mm512_reduce_add_epi64(accum);
|
||||
/* return computed value, to prevent the above being optimized away */
|
||||
return popcnt == 0;
|
||||
}
|
||||
'''
|
||||
|
||||
if cc.links(prog, name: 'AVX-512 popcount without -mavx512vpopcntdq -mavx512bw',
|
||||
args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))])
|
||||
cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
|
||||
elif cc.links(prog, name: 'AVX-512 popcount with -mavx512vpopcntdq -mavx512bw',
|
||||
args: test_c_args + ['-DINT64=@0@'.format(cdata.get('PG_INT64_TYPE'))] + ['-mavx512vpopcntdq'] + ['-mavx512bw'])
|
||||
cdata.set('USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 1)
|
||||
cflags_popcnt += ['-mavx512vpopcntdq'] + ['-mavx512bw']
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
###############################################################
|
||||
# Select CRC-32C implementation.
|
||||
|
|
|
@ -262,7 +262,9 @@ CFLAGS_SL_MODULE = @CFLAGS_SL_MODULE@
|
|||
CXXFLAGS_SL_MODULE = @CXXFLAGS_SL_MODULE@
|
||||
CFLAGS_UNROLL_LOOPS = @CFLAGS_UNROLL_LOOPS@
|
||||
CFLAGS_VECTORIZE = @CFLAGS_VECTORIZE@
|
||||
CFLAGS_POPCNT = @CFLAGS_POPCNT@
|
||||
CFLAGS_CRC = @CFLAGS_CRC@
|
||||
CFLAGS_XSAVE = @CFLAGS_XSAVE@
|
||||
PERMIT_DECLARATION_AFTER_STATEMENT = @PERMIT_DECLARATION_AFTER_STATEMENT@
|
||||
CXXFLAGS = @CXXFLAGS@
|
||||
|
||||
|
@ -758,6 +760,9 @@ LIBOBJS = @LIBOBJS@
|
|||
# files needed for the chosen CRC-32C implementation
|
||||
PG_CRC32C_OBJS = @PG_CRC32C_OBJS@
|
||||
|
||||
# files needed for the chosen popcount implementation
|
||||
PG_POPCNT_OBJS = @PG_POPCNT_OBJS@
|
||||
|
||||
LIBS := -lpgcommon -lpgport $(LIBS)
|
||||
|
||||
# to make ws2_32.lib the last library
|
||||
|
|
|
@ -513,6 +513,9 @@
|
|||
/* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */
|
||||
#undef HAVE_X86_64_POPCNTQ
|
||||
|
||||
/* Define to 1 if you have XSAVE intrinsics. */
|
||||
#undef HAVE_XSAVE_INTRINSICS
|
||||
|
||||
/* Define to 1 if the system has the type `_Bool'. */
|
||||
#undef HAVE__BOOL
|
||||
|
||||
|
@ -555,9 +558,15 @@
|
|||
/* Define to 1 if you have __cpuid. */
|
||||
#undef HAVE__CPUID
|
||||
|
||||
/* Define to 1 if you have __cpuidex. */
|
||||
#undef HAVE__CPUIDEX
|
||||
|
||||
/* Define to 1 if you have __get_cpuid. */
|
||||
#undef HAVE__GET_CPUID
|
||||
|
||||
/* Define to 1 if you have __get_cpuid_count. */
|
||||
#undef HAVE__GET_CPUID_COUNT
|
||||
|
||||
/* Define to 1 if your compiler understands _Static_assert. */
|
||||
#undef HAVE__STATIC_ASSERT
|
||||
|
||||
|
@ -680,6 +689,9 @@
|
|||
/* Define to 1 to build with assertion checks. (--enable-cassert) */
|
||||
#undef USE_ASSERT_CHECKING
|
||||
|
||||
/* Define to 1 to use AVX-512 popcount instructions with a runtime check. */
|
||||
#undef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
||||
|
||||
/* Define to 1 to build with Bonjour support. (--with-bonjour) */
|
||||
#undef USE_BONJOUR
|
||||
|
||||
|
|
|
@ -304,6 +304,17 @@ extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
|
|||
extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
|
||||
extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
|
||||
|
||||
/*
|
||||
* We can also try to use the AVX-512 popcount instruction on some systems.
|
||||
* The implementation of that is located in its own file because it may
|
||||
* require special compiler flags that we don't want to apply to any other
|
||||
* files.
|
||||
*/
|
||||
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
||||
extern bool pg_popcount_avx512_available(void);
|
||||
extern uint64 pg_popcount_avx512(const char *buf, int bytes);
|
||||
#endif
|
||||
|
||||
#else
|
||||
/* Use a portable implementation -- no need for a function pointer. */
|
||||
extern int pg_popcount32(uint32 word);
|
||||
|
|
|
@ -100,8 +100,10 @@ pgxs_kv = {
|
|||
' '.join(cflags_no_decl_after_statement),
|
||||
|
||||
'CFLAGS_CRC': ' '.join(cflags_crc),
|
||||
'CFLAGS_POPCNT': ' '.join(cflags_popcnt),
|
||||
'CFLAGS_UNROLL_LOOPS': ' '.join(unroll_loops_cflags),
|
||||
'CFLAGS_VECTORIZE': ' '.join(vectorize_cflags),
|
||||
'CFLAGS_XSAVE': ' '.join(cflags_xsave),
|
||||
|
||||
'LDFLAGS': var_ldflags,
|
||||
'LDFLAGS_EX': var_ldflags_ex,
|
||||
|
@ -177,7 +179,7 @@ pgxs_empty = [
|
|||
'WANTED_LANGUAGES',
|
||||
|
||||
# Not needed because we don't build the server / PLs with the generated makefile
|
||||
'LIBOBJS', 'PG_CRC32C_OBJS', 'TAS',
|
||||
'LIBOBJS', 'PG_CRC32C_OBJS', 'PG_POPCNT_OBJS', 'TAS',
|
||||
'DTRACEFLAGS', # only server has dtrace probes
|
||||
|
||||
'perl_archlibexp', 'perl_embed_ccflags', 'perl_embed_ldflags', 'perl_includespec', 'perl_privlibexp',
|
||||
|
|
|
@ -38,6 +38,7 @@ LIBS += $(PTHREAD_LIBS)
|
|||
OBJS = \
|
||||
$(LIBOBJS) \
|
||||
$(PG_CRC32C_OBJS) \
|
||||
$(PG_POPCNT_OBJS) \
|
||||
bsearch_arg.o \
|
||||
chklocale.o \
|
||||
inet_net_ntop.o \
|
||||
|
@ -92,6 +93,16 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
|
|||
pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
|
||||
pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
|
||||
|
||||
# all versions of pg_popcount_avx512_choose.o need CFLAGS_XSAVE
|
||||
pg_popcount_avx512_choose.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
pg_popcount_avx512_choose_shlib.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
pg_popcount_avx512_choose_srv.o: CFLAGS+=$(CFLAGS_XSAVE)
|
||||
|
||||
# all versions of pg_popcount_avx512.o need CFLAGS_POPCNT
|
||||
pg_popcount_avx512.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
pg_popcount_avx512_shlib.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
pg_popcount_avx512_srv.o: CFLAGS+=$(CFLAGS_POPCNT)
|
||||
|
||||
#
|
||||
# Shared library versions of object files
|
||||
#
|
||||
|
|
|
@ -84,6 +84,8 @@ replace_funcs_pos = [
|
|||
['pg_crc32c_sse42', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
|
||||
['pg_crc32c_sse42_choose', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
['pg_crc32c_sb8', 'USE_SSE42_CRC32C_WITH_RUNTIME_CHECK'],
|
||||
['pg_popcount_avx512', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'popcnt'],
|
||||
['pg_popcount_avx512_choose', 'USE_AVX512_POPCNT_WITH_RUNTIME_CHECK', 'xsave'],
|
||||
|
||||
# arm / aarch64
|
||||
['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
|
||||
|
@ -98,8 +100,8 @@ replace_funcs_pos = [
|
|||
['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
|
||||
]
|
||||
|
||||
pgport_cflags = {'crc': cflags_crc}
|
||||
pgport_sources_cflags = {'crc': []}
|
||||
pgport_cflags = {'crc': cflags_crc, 'popcnt': cflags_popcnt, 'xsave': cflags_xsave}
|
||||
pgport_sources_cflags = {'crc': [], 'popcnt': [], 'xsave': []}
|
||||
|
||||
foreach f : replace_funcs_neg
|
||||
func = f.get(0)
|
||||
|
|
|
@ -163,6 +163,11 @@ choose_popcount_functions(void)
|
|||
pg_popcount64 = pg_popcount64_slow;
|
||||
pg_popcount_optimized = pg_popcount_slow;
|
||||
}
|
||||
|
||||
#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
|
||||
if (pg_popcount_avx512_available())
|
||||
pg_popcount_optimized = pg_popcount_avx512;
|
||||
#endif
|
||||
}
|
||||
|
||||
static int
|
||||
|
|
81
src/port/pg_popcount_avx512.c
Normal file
81
src/port/pg_popcount_avx512.c
Normal file
|
@ -0,0 +1,81 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pg_popcount_avx512.c
|
||||
* Holds the AVX-512 pg_popcount() implementation.
|
||||
*
|
||||
* Copyright (c) 2024, PostgreSQL Global Development Group
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/port/pg_popcount_avx512.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "c.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#include "port/pg_bitutils.h"
|
||||
|
||||
/*
|
||||
* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
|
||||
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
|
||||
* the function pointers that are only used when TRY_POPCNT_FAST is set.
|
||||
*/
|
||||
#ifdef TRY_POPCNT_FAST
|
||||
|
||||
/*
|
||||
* pg_popcount_avx512
|
||||
* Returns the number of 1-bits in buf
|
||||
*/
|
||||
uint64
|
||||
pg_popcount_avx512(const char *buf, int bytes)
|
||||
{
|
||||
__m512i val,
|
||||
cnt;
|
||||
__m512i accum = _mm512_setzero_si512();
|
||||
const char *final;
|
||||
int tail_idx;
|
||||
__mmask64 mask = ~UINT64CONST(0);
|
||||
|
||||
/*
|
||||
* Align buffer down to avoid double load overhead from unaligned access.
|
||||
* Calculate a mask to ignore preceding bytes. Find start offset of final
|
||||
* iteration and ensure it is not empty.
|
||||
*/
|
||||
mask <<= ((uintptr_t) buf) % sizeof(__m512i);
|
||||
tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
|
||||
final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
|
||||
buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
|
||||
|
||||
/*
|
||||
* Iterate through all but the final iteration. Starting from the second
|
||||
* iteration, the mask is ignored.
|
||||
*/
|
||||
if (buf < final)
|
||||
{
|
||||
val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
|
||||
cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
|
||||
buf += sizeof(__m512i);
|
||||
mask = ~UINT64CONST(0);
|
||||
|
||||
for (; buf < final; buf += sizeof(__m512i))
|
||||
{
|
||||
val = _mm512_load_si512((const __m512i *) buf);
|
||||
cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
}
|
||||
}
|
||||
|
||||
/* Final iteration needs to ignore bytes that are not within the length */
|
||||
mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
|
||||
|
||||
val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
|
||||
cnt = _mm512_popcnt_epi64(val);
|
||||
accum = _mm512_add_epi64(accum, cnt);
|
||||
|
||||
return _mm512_reduce_add_epi64(accum);
|
||||
}
|
||||
|
||||
#endif /* TRY_POPCNT_FAST */
|
88
src/port/pg_popcount_avx512_choose.c
Normal file
88
src/port/pg_popcount_avx512_choose.c
Normal file
|
@ -0,0 +1,88 @@
|
|||
/*-------------------------------------------------------------------------
|
||||
*
|
||||
* pg_popcount_avx512_choose.c
|
||||
* Test whether we can use the AVX-512 pg_popcount() implementation.
|
||||
*
|
||||
* Copyright (c) 2024, PostgreSQL Global Development Group
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* src/port/pg_popcount_avx512_choose.c
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
#include "c.h"
|
||||
|
||||
#if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
|
||||
#include <cpuid.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_XSAVE_INTRINSICS
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#include "port/pg_bitutils.h"
|
||||
|
||||
/*
|
||||
* It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
|
||||
* use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
|
||||
* the function pointers that are only used when TRY_POPCNT_FAST is set.
|
||||
*/
|
||||
#ifdef TRY_POPCNT_FAST
|
||||
|
||||
/*
 * Returns true if the CPU supports the instructions required for the AVX-512
 * pg_popcount() implementation.
 *
 * Three things must hold:
 *
 * 1. CPUID reports the AVX512-VPOPCNTDQ and AVX512-BW instruction sets.
 * 2. CPUID reports OSXSAVE, i.e. the operating system has enabled XGETBV so
 *    that XCR0 can be read from user code.
 * 3. XCR0 shows that the operating system saves and restores all register
 *    state that AVX-512 code uses.
 *
 * Returns false if any check fails, or if the build has no XSAVE intrinsics
 * with which to perform the last check.
 */
bool
pg_popcount_avx512_available(void)
{
	unsigned int exx[4] = {0, 0, 0, 0};

	/*
	 * Does CPUID say there's support for the AVX-512 popcount and the AVX-512
	 * byte and word instructions?  Both feature flags are reported by leaf 7,
	 * subleaf 0, so one query covers them.  exx[] holds EAX, EBX, ECX, EDX in
	 * that order.
	 */
#if defined(HAVE__GET_CPUID_COUNT)
	__get_cpuid_count(7, 0, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUIDEX)
	__cpuidex(exx, 7, 0);
#else
#error cpuid instruction not available
#endif
	if ((exx[2] & (1 << 14)) == 0)	/* avx512-vpopcntdq */
		return false;
	if ((exx[1] & (1 << 30)) == 0)	/* avx512-bw */
		return false;

	/*
	 * Does CPUID say the operating system has enabled XGETBV?  This is the
	 * OSXSAVE flag (leaf 1, ECX bit 27).  The neighboring XSAVE flag (bit 26)
	 * only says that the CPU implements the feature, not that the OS has
	 * turned it on, so it is not sufficient before executing XGETBV.
	 */
	memset(exx, 0, sizeof(exx));
#if defined(HAVE__GET_CPUID)
	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
#elif defined(HAVE__CPUID)
	__cpuid(exx, 1);
#else
#error cpuid instruction not available
#endif
	if ((exx[2] & (1 << 27)) == 0)	/* osxsave */
		return false;

	/*
	 * Does XGETBV say the ZMM registers are enabled?  All of the following
	 * XCR0 bits must be set, not merely one of them: bit 1 (SSE/XMM), bit 2
	 * (AVX/YMM), bit 5 (opmask), bit 6 (upper halves of ZMM0-15) and bit 7
	 * (ZMM16-31).  Together they form the mask 0xe6.
	 */
#ifdef HAVE_XSAVE_INTRINSICS
	return (_xgetbv(0) & 0xe6) == 0xe6;
#else
	return false;
#endif
}
|
||||
|
||||
#endif /* TRY_POPCNT_FAST */
|
|
@ -740,6 +740,30 @@ SELECT bit_count(B'1111111111'::bit(10));
|
|||
10
|
||||
(1 row)
|
||||
|
||||
SELECT bit_count(repeat('0', 100)::bit(100));
|
||||
bit_count
|
||||
-----------
|
||||
0
|
||||
(1 row)
|
||||
|
||||
SELECT bit_count(repeat('1', 100)::bit(100));
|
||||
bit_count
|
||||
-----------
|
||||
100
|
||||
(1 row)
|
||||
|
||||
SELECT bit_count(repeat('01', 500)::bit(1000));
|
||||
bit_count
|
||||
-----------
|
||||
500
|
||||
(1 row)
|
||||
|
||||
SELECT bit_count(repeat('10101', 200)::bit(1000));
|
||||
bit_count
|
||||
-----------
|
||||
600
|
||||
(1 row)
|
||||
|
||||
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
|
||||
CREATE TABLE bit_defaults(
|
||||
b1 bit(4) DEFAULT '1001',
|
||||
|
|
|
@ -223,6 +223,10 @@ SELECT overlay(B'0101011100' placing '001' from 20);
|
|||
-- bit_count
|
||||
SELECT bit_count(B'0101011100'::bit(10));
|
||||
SELECT bit_count(B'1111111111'::bit(10));
|
||||
SELECT bit_count(repeat('0', 100)::bit(100));
|
||||
SELECT bit_count(repeat('1', 100)::bit(100));
|
||||
SELECT bit_count(repeat('01', 500)::bit(1000));
|
||||
SELECT bit_count(repeat('10101', 200)::bit(1000));
|
||||
|
||||
-- This table is intentionally left around to exercise pg_dump/pg_upgrade
|
||||
CREATE TABLE bit_defaults(
|
||||
|
|
Loading…
Reference in a new issue