[PATCH] Add loongarch native checksum implementation.

Started by YANG Xudong over 2 years ago · 24 messages
#1 YANG Xudong
yangxudong@ymatrix.cn
1 attachment(s)

Attachments:

0001-Add-loongarch-native-checksum-implementation.patch (text/plain; charset=UTF-8; name=0001-Add-loongarch-native-checksum-implementation.patch) [Download]
From 6ccee980a42f1706094a51fa146c55edc3adfbb1 Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxudong@ymatrix.cn>
Date: Thu, 8 Jun 2023 13:22:03 +0800
Subject: [PATCH] Add loongarch native checksum implementation.

---
 config/c-compiler.m4           |  34 ++++++++++
 configure                      | 118 +++++++++++++++++++++++++++++++--
 configure.ac                   |  39 ++++++++---
 meson.build                    |  29 ++++++++
 src/include/pg_config.h.in     |   3 +
 src/include/port/pg_crc32c.h   |   9 +++
 src/port/Makefile              |   5 ++
 src/port/meson.build           |   3 +
 src/port/pg_crc32c_loongarch.c |  72 ++++++++++++++++++++
 9 files changed, 297 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..eb3af009c4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=$1],
+  [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 1b415142d1..c5d8c2e4f6 100755
--- a/configure
+++ b/configure
@@ -18116,6 +18116,96 @@ fi
 
 
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics_=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics_" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -march=loongarch64"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" = x"yes"; then
+  CFLAGS_CRC="-march=loongarch64"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+fi
+
+
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -18132,7 +18222,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18150,10 +18240,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18194,12 +18289,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 09558ada0f..5dcb1c4ab9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2128,6 +2128,18 @@ fi
 
 AC_SUBST(CFLAGS_CRC)
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.
+PGAC_LOONGARCH_CRC32C_INTRINSICS([])
+if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then
+  PGAC_LOONGARCH_CRC32C_INTRINSICS([-march=loongarch64])
+fi
+
+AC_SUBST(CFLAGS_CRC)
+
 # Select CRC-32C implementation.
 #
 # If we are targeting a processor that has Intel SSE 4.2 instructions, we can
@@ -2144,7 +2156,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2162,10 +2174,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2193,9 +2210,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 16b2e86646..5de8a85c94 100644
--- a/meson.build
+++ b/meson.build
@@ -2073,6 +2073,35 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w without -march=loongarch64',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  elif cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w with -march=loongarch64',
+      args: test_c_args + ['-march=loongarch64'])
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..1f253566e4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -724,6 +724,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/Makefile b/src/port/Makefile
index 711f59e32b..1933b0e750 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 24416b9bfc..3e77b2493a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..b54fe12ff5
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Aligned memory access is significantly faster.
+	 * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0

#2 John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#1)
Re: [PATCH] Add loongarch native checksum implementation.

On Thu, Jun 8, 2023 at 12:24 PM YANG Xudong <yangxudong@ymatrix.cn> wrote:

This patch tries to add loongarch native crc32 check with crcc.*
instructions to postgresql.

The patch is tested on my Loongson 3A5000 machine with Loong Arch Linux
and GCC 13.1.0 / clang 16.0.0 with

- default ./configure
- default meson setup

I took a quick look at this, and it seems mostly in line with other
architectures we support for CRC. I have a couple questions and comments:

configure.ac:

+AC_SUBST(CFLAGS_CRC)

This seems to be an unnecessary copy-paste. I think we only need one, after
all checks have run.

meson.build

+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w,
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and
__builtin_loongarch_crcc_w_d_w without -march=loongarch64',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  elif cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w,
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and
__builtin_loongarch_crcc_w_d_w with -march=loongarch64',
+      args: test_c_args + ['-march=loongarch64'])
+    # Use LoongArch CRC instruction unconditionally

For x86 and Arm, if it fails to link without an -march flag, we allow for a
runtime check. The flags "-march=armv8-a+crc" and "-msse4.2" are for
instructions not found on all platforms. The patch also checks both ways,
and each one results in "Use LoongArch CRC instruction unconditionally".
The -march flag here is general, not specific. In other words, if this only
runs inside "+elif host_cpu == 'loongarch64'", why do we need both with
-march and without?

Also, I don't have a Loongarch machine for testing. Could you show that the
instructions are found in the binary, maybe using objdump and grep? Or a
performance test?

In the future, you may also consider running the buildfarm client on a
machine dedicated for testing. That will give us quick feedback if some
future new code doesn't work on this platform. More information here:

https://wiki.postgresql.org/wiki/PostgreSQL_Buildfarm_Howto

--
John Naylor
EDB: http://www.enterprisedb.com

#3 YANG Xudong
yangxudong@ymatrix.cn
In reply to: John Naylor (#2)
3 attachment(s)
Re: [PATCH] Add loongarch native checksum implementation.

Attached a new patch with fixes based on the comment below.

On 2023/6/13 18:26, John Naylor wrote:

On Thu, Jun 8, 2023 at 12:24 PM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:

This patch tries to add loongarch native crc32 check with crcc.*
instructions to postgresql.

The patch is tested on my Loongson 3A5000 machine with Loong Arch Linux
and GCC 13.1.0 / clang 16.0.0 with

- default ./configure
- default meson setup

I took a quick look at this, and it seems mostly in line with other
architectures we support for CRC. I have a couple questions and comments:

configure.ac <http://configure.ac>:

+AC_SUBST(CFLAGS_CRC)

This seems to be an unnecessary copy-paste. I think we only need one,
after all checks have run.

Removed the extra line.

meson.build

+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, 
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and 
__builtin_loongarch_crcc_w_d_w without -march=loongarch64',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  elif cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, 
__builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and 
__builtin_loongarch_crcc_w_d_w with -march=loongarch64',
+      args: test_c_args + ['-march=loongarch64'])
+    # Use LoongArch CRC instruction unconditionally

For x86 and Arm, if it fails to link without an -march flag, we allow
for a runtime check. The flags "-march=armv8-a+crc" and "-msse4.2" are
for instructions not found on all platforms. The patch also checks both
ways, and each one results in "Use LoongArch CRC instruction
unconditionally". The -march flag here is general, not specific. In
other words, if this only runs inside "+elif host_cpu == 'loongarch64'",
why do we need both with -march and without?

Removed the elif branch.

Also, I don't have a Loongarch machine for testing. Could you show that
the instructions are found in the binary, maybe using objdump and grep?
Or a performance test?

The output of the objdump command `objdump -dS
../postgres-build/tmp_install/usr/local/pgsql/bin/postgres | grep -B 30
-A 10 crcc` is attached.

Also the output of make check is attached.

I ran a simple test program to compare the performance of
pg_comp_crc32c_loongarch and pg_comp_crc32c_sb8 on my test machine. The
result is that pg_comp_crc32c_loongarch is over 2x faster than
pg_comp_crc32c_sb8.

In the future, you may also consider running the buildfarm client on a
machine dedicated for testing. That will give us quick feedback if some
future new code doesn't work on this platform. More information here:

https://wiki.postgresql.org/wiki/PostgreSQL_Buildfarm_Howto
<https://wiki.postgresql.org/wiki/PostgreSQL_Buildfarm_Howto>

I will contact the loongson community
(https://github.com/loongson-community) to see if they are able to
provide some machine for buildfarm or not.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

--
YANG Xudong

Attachments:

0001-Add-loongarch-native-checksum-implementation.patch (text/plain; charset=UTF-8; name=0001-Add-loongarch-native-checksum-implementation.patch) [Download]
From c6441c9e446d13ddf60b1f5432114a5289968403 Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxudong@ymatrix.cn>
Date: Wed, 14 Jun 2023 09:49:49 +0800
Subject: [PATCH] Add loongarch native checksum implementation.

---
 config/c-compiler.m4           |  34 ++++++++++
 configure                      | 116 +++++++++++++++++++++++++++++++--
 configure.ac                   |  37 ++++++++---
 meson.build                    |  24 +++++++
 src/include/pg_config.h.in     |   3 +
 src/include/port/pg_crc32c.h   |   9 +++
 src/port/Makefile              |   5 ++
 src/port/meson.build           |   3 +
 src/port/pg_crc32c_loongarch.c |  72 ++++++++++++++++++++
 9 files changed, 288 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..eb3af009c4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=$1],
+  [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 1b415142d1..fa4a0fdcb3 100755
--- a/configure
+++ b/configure
@@ -18114,6 +18114,94 @@ fi
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics_=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics_" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS -march=loongarch64"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" = x"yes"; then
+  CFLAGS_CRC="-march=loongarch64"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+fi
+
 
 
 # Select CRC-32C implementation.
@@ -18132,7 +18220,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18150,10 +18238,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18194,12 +18287,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 09558ada0f..a312a33eaf 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2126,6 +2126,16 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.
+PGAC_LOONGARCH_CRC32C_INTRINSICS([])
+if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then
+  PGAC_LOONGARCH_CRC32C_INTRINSICS([-march=loongarch64])
+fi
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2144,7 +2154,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2162,10 +2172,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2193,9 +2208,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 82f2782673..77a9b985df 100644
--- a/meson.build
+++ b/meson.build
@@ -2073,6 +2073,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..1f253566e4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -724,6 +724,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/Makefile b/src/port/Makefile
index 711f59e32b..1933b0e750 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
 
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)
+
 #
 # Shared library versions of object files
 #
diff --git a/src/port/meson.build b/src/port/meson.build
index 24416b9bfc..3e77b2493a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..b54fe12ff5
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Aligned memory access is significantly faster. Process leading bytes so
+	 * that the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0

objdump.out text/plain; charset=UTF-8; name=objdump.out Download
test.out text/plain; charset=UTF-8; name=test.out Download
#4John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#3)
Re: [PATCH] Add loongarch native checksum implementation.

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxudong@ymatrix.cn> wrote:

Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different
versions.

For x86 and Arm, if it fails to link without an -march flag, we allow
for a runtime check. The flags "-march=armv8-a+crc" and "-msse4.2" are
for instructions not found on all platforms. The patch also checks both
ways, and each one results in "Use LoongArch CRC instruction
unconditionally". The -march flag here is general, not specific. In
other words, if this only runs inside "+elif host_cpu == 'loongarch64'",
why do we need both with -march and without?

Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other
places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so should
be unnecessary.

+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

Also, I don't have a Loongarch machine for testing. Could you show that
the instructions are found in the binary, maybe using objdump and grep?
Or a performance test?

The output of the objdump command `objdump -dS
../postgres-build/tmp_install/usr/local/pgsql/bin/postgres | grep -B 30
-A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com

#5YANG Xudong
yangxudong@ymatrix.cn
In reply to: John Naylor (#4)
1 attachment(s)
Re: [PATCH] Add loongarch native checksum implementation.

Updated the patch based on the comments.

On 2023/6/15 18:30, John Naylor wrote:

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:

Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different
versions.

Added v2

For x86 and Arm, if it fails to link without an -march flag, we allow
for a runtime check. The flags "-march=armv8-a+crc" and "-msse4.2" are
for instructions not found on all platforms. The patch also checks both
ways, and each one results in "Use LoongArch CRC instruction
unconditionally". The -march flag here is general, not specific. In
other words, if this only runs inside "+elif host_cpu == 'loongarch64'",
why do we need both with -march and without?

Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other
places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
 pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so
should be unnecessary.

Removed these lines.

+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

Show quoted text

Also, I don't have a Loongarch machine for testing. Could you show that
the instructions are found in the binary, maybe using objdump and grep?
Or a performance test?

The output of the objdump command `objdump -dS
../postgres-build/tmp_install/usr/local/pgsql/bin/postgres  | grep -B 30
-A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

Attachments:

v2-0001-Add-loongarch-native-checksum-implementation.patch text/plain; charset=UTF-8; name=v2-0001-Add-loongarch-native-checksum-implementation.patch Download
From b2329478b5331e2aa9942c7ee4e23e3bfa871c1d Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxudong@ymatrix.cn>
Date: Fri, 16 Jun 2023 09:22:08 +0800
Subject: [PATCH v2] Add loongarch native checksum implementation.

---
 config/c-compiler.m4           | 34 ++++++++++++++++
 configure                      | 73 ++++++++++++++++++++++++++++++----
 configure.ac                   | 33 +++++++++++----
 meson.build                    | 24 +++++++++++
 src/include/pg_config.h.in     |  3 ++
 src/include/port/pg_crc32c.h   |  9 +++++
 src/port/meson.build           |  3 ++
 src/port/pg_crc32c_loongarch.c | 72 +++++++++++++++++++++++++++++++++
 8 files changed, 236 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..eb3af009c4 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics to
+# "yes" and sets CFLAGS_CRC to the flags passed as $1.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=$1],
+  [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS $1"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  CFLAGS_CRC="$1"
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 1b415142d1..7c60a26c11 100755
--- a/configure
+++ b/configure
@@ -18114,6 +18114,51 @@ fi
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS "
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics_=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics_" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then
+  CFLAGS_CRC=""
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
 
 
 # Select CRC-32C implementation.
@@ -18132,7 +18177,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18150,10 +18195,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18194,12 +18244,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 09558ada0f..d44e9f9ef8 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2126,6 +2126,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS([])
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2144,7 +2150,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2162,10 +2168,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2193,9 +2204,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 82f2782673..77a9b985df 100644
--- a/meson.build
+++ b/meson.build
@@ -2073,6 +2073,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 6d572c3820..1f253566e4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -724,6 +724,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 24416b9bfc..3e77b2493a 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..b54fe12ff5
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,72 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Aligned memory access is significantly faster. Process leading bytes so
+	 * that the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0

#6YANG Xudong
yangxudong@ymatrix.cn
In reply to: YANG Xudong (#5)
Re: [PATCH] Add loongarch native checksum implementation.

Is there any other comment?

If the patch looks OK, I would like to update its status to ready for
committer in the commitfest.

Thanks!

Show quoted text

On 2023/6/16 09:28, YANG Xudong wrote:

Updated the patch based on the comments.

On 2023/6/15 18:30, John Naylor wrote:

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:
 >
 > Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different
versions.

Added v2

 > > For x86 and Arm, if it fails to link without an -march flag, we
allow
 > > for a runtime check. The flags "-march=armv8-a+crc" and
"-msse4.2" are
 > > for instructions not found on all platforms. The patch also
checks both
 > > ways, and each one results in "Use LoongArch CRC instruction
 > > unconditionally". The -march flag here is general, not specific. In
 > > other words, if this only runs inside "+elif host_cpu ==
'loongarch64'",
 > > why do we need both with -march and without?
 > >
 >
 > Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other
places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so
should be unnecessary.

Removed these lines.

+# If the intrinsics are supported, sets 
pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

 > > Also, I don't have a Loongarch machine for testing. Could you
show that
 > > the instructions are found in the binary, maybe using objdump and
grep?
 > > Or a performance test?
 > >
 >
 > The output of the objdump command `objdump -dS
 > ../postgres-build/tmp_install/usr/local/pgsql/bin/postgres  | grep
-B 30
 > -A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

#7John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#6)
Re: [PATCH] Add loongarch native checksum implementation.

On Wed, Jul 5, 2023 at 9:16 AM YANG Xudong <yangxudong@ymatrix.cn> wrote:

Is there any other comment?

It's only been a few weeks since the last patch, and this is not an urgent
bugfix, so there is no reason to ping the thread. Feature freeze will
likely be in April of next year.

Also, please don't top-post (which means: quoting an entire message, with
new text at the top) -- it clutters our archives.

Before I look at this again: Are there any objections to another CRC
implementation for the reason of having no buildfarm member?

--
John Naylor
EDB: http://www.enterprisedb.com

#8huchangqi
huchangqi@loongson.cn
In reply to: YANG Xudong (#6)
Re: Re: [PATCH] Add loongarch native checksum implementation.

Hi, I have a loongarch machine running Loongnix-server (based on redhat/centos, it has gcc-8.3 on it), I am trying to test buildfarm on it, when I edit build-farm.conf it seems to need animal and secret to connect the buildfarm server.
then i go to https://buildfarm.postgresql.org/cgi-bin/register-form.pl, and registered the buildfarm a week ago, but didn't receive any response. so what else i need to do next.

-----Original Messages-----
From: "YANG Xudong" <yangxudong@ymatrix.cn>
Send time:Wednesday, 07/05/2023 10:15:51
To: "John Naylor" <john.naylor@enterprisedb.com>
Cc: pgsql-hackers@lists.postgresql.org, wengyanqing@ymatrix.cn, wanghao@ymatrix.cn
Subject: Re: [PATCH] Add loongarch native checksum implementation.

Is there any other comment?

If the patch looks OK, I would like to update its status to ready for
committer in the commitfest.

Thanks!

On 2023/6/16 09:28, YANG Xudong wrote:

Updated the patch based on the comments.

On 2023/6/15 18:30, John Naylor wrote:

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:
 >
 > Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different
versions.

Added v2

 > > For x86 and Arm, if it fails to link without an -march flag, we
allow
 > > for a runtime check. The flags "-march=armv8-a+crc" and
"-msse4.2" are
 > > for instructions not found on all platforms. The patch also
checks both
 > > ways, and each one results in "Use LoongArch CRC instruction
 > > unconditionally". The -march flag here is general, not specific. In
 > > other words, if this only runs inside "+elif host_cpu ==
'loongarch64'",
 > > why do we need both with -march and without?
 > >
 >
 > Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other
places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so
should be unnecessary.

Removed these lines.

+# If the intrinsics are supported, sets 
pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

 > > Also, I don't have a Loongarch machine for testing. Could you
show that
 > > the instructions are found in the binary, maybe using objdump and
grep?
 > > Or a performance test?
 > >
 >
 > The output of the objdump command `objdump -dS
 > ../postgres-build/tmp_install/usr/local/pgsql/bin/postgres  | grep
-B 30
 > -A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

本邮件及其附件含有龙芯中科的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。
This email and its attachments contain confidential information from Loongson Technology , which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.

#9Daniel Gustafsson
daniel@yesql.se
In reply to: huchangqi (#8)
Re: [PATCH] Add loongarch native checksum implementation.

On 6 Jul 2023, at 09:14, huchangqi <huchangqi@loongson.cn> wrote:

Hi, I have a loongarch machine running Loongnix-server (based on redhat/centos, it has gcc-8.3 on it), I am trying to test buildfarm on it, when I edit build-farm.conf it seems to need animal and secret to connect the buildfarm server.
then i go to https://buildfarm.postgresql.org/cgi-bin/register-form.pl, and registered the buildfarm a week ago, but didn't receive any response. so what else i need to do next.

Thanks for volunteering a buildfarm animal! The registration is probably just
pending due to it being summer and things slow down, but I've added Andrew
Dunstan who is the Buildfarm expert on CC:.

--
Daniel Gustafsson

#10YANG Xudong
yangxudong@ymatrix.cn
In reply to: huchangqi (#8)
Re: [PATCH] Add loongarch native checksum implementation.

On 2023/7/6 15:14, huchangqi wrote:

Hi, I have a loongarch machine running Loongnix-server (based on redhat/centos, it has gcc-8.3 on it), I am trying to test buildfarm on it, when I edit build-farm.conf it seems to need animal and secret to connect the buildfarm server.
then i go to https://buildfarm.postgresql.org/cgi-bin/register-form.pl, and registered the buildfarm a week ago, but didn't receive any response. so what else i need to do next.

Is it possible to provide a build farm instance for new world ABI of
loongarch also by loongson? It will be really appreciated.

Thanks!

Show quoted text

-----Original Messages-----
From: "YANG Xudong" <yangxudong@ymatrix.cn>
Send time:Wednesday, 07/05/2023 10:15:51
To: "John Naylor" <john.naylor@enterprisedb.com>
Cc: pgsql-hackers@lists.postgresql.org, wengyanqing@ymatrix.cn, wanghao@ymatrix.cn
Subject: Re: [PATCH] Add loongarch native checksum implementation.

Is there any other comment?

If the patch looks OK, I would like to update its status to ready for
committer in the commitfest.

Thanks!

On 2023/6/16 09:28, YANG Xudong wrote:

Updated the patch based on the comments.

On 2023/6/15 18:30, John Naylor wrote:

On Wed, Jun 14, 2023 at 9:20 AM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:
 >
 > Attached a new patch with fixes based on the comment below.

Note: It's helpful to pass "-v" to git format-patch, to have different
versions.

Added v2

 > > For x86 and Arm, if it fails to link without an -march flag, we
allow
 > > for a runtime check. The flags "-march=armv8-a+crc" and
"-msse4.2" are
 > > for instructions not found on all platforms. The patch also
checks both
 > > ways, and each one results in "Use LoongArch CRC instruction
 > > unconditionally". The -march flag here is general, not specific. In
 > > other words, if this only runs inside "+elif host_cpu ==
'loongarch64'",
 > > why do we need both with -march and without?
 > >
 >
 > Removed the elif branch.

Okay, since we've confirmed that no arch flag is necessary, some other
places can be simplified:

--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC)
  pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC)
+# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC
+pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC)
+pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC)

This was copy-and-pasted from platforms that use a runtime check, so
should be unnecessary.

Removed these lines.

+# If the intrinsics are supported, sets
pgac_loongarch_crc32c_intrinsics,
+# and CFLAGS_CRC.
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

 > > Also, I don't have a Loongarch machine for testing. Could you
show that
 > > the instructions are found in the binary, maybe using objdump and
grep?
 > > Or a performance test?
 > >
 >
 > The output of the objdump command `objdump -dS
 > ../postgres-build/tmp_install/usr/local/pgsql/bin/postgres  | grep
-B 30
 > -A 10 crcc` is attached.

Thanks for confirming.

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

本邮件及其附件含有龙芯中科的商业秘密信息,仅限于发送给上面地址中列出的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制或散发)本邮件及其附件中的信息。如果您错收本邮件,请您立即电话或邮件通知发件人并删除本邮件。
This email and its attachments contain confidential information from Loongson Technology , which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction or dissemination) by persons other than the intended recipient(s) is prohibited. If you receive this email in error, please notify the sender by phone or email immediately and delete it.

#11YANG Xudong
yangxudong@ymatrix.cn
In reply to: YANG Xudong (#10)
Re: [PATCH] Add loongarch native checksum implementation.

Many thanks to huchangqi. Now we have loongarch64 support for both old
world ABI and new world ABI on the buildfarm!

-------- Forwarded Message --------
Subject: Re: [PATCH] Add loongarch native checksum implementation.
Date: Tue, 25 Jul 2023 15:51:43 +0800
From: huchangqi <huchangqi@loongson.cn>
To: YANG Xudong <yangxudong@ymatrix.cn>

Both cisticola and nuthatch are on the buildfarm now.

cisticola is "old world ABI".
https://buildfarm.postgresql.org/cgi-bin/show_history.pl?nm=cisticola&br=HEAD

nuthatch is "new world ABI".
https://buildfarm.postgresql.org/cgi-bin/show_history.pl?nm=nuthatch&br=HEAD

----------
Best regards,
huchangqi

On 2023/7/5 15:11, John Naylor wrote:

Before I look at this again: Are there any objections to another CRC
implementation for the reason of having no buildfarm member?

It is possible to try this patch on buildfarm now, I guess?

#12Michael Paquier
michael@paquier.xyz
In reply to: John Naylor (#7)
Re: [PATCH] Add loongarch native checksum implementation.

On Wed, Jul 05, 2023 at 02:11:02PM +0700, John Naylor wrote:

Also, please don't top-post (which means: quoting an entire message, with
new text at the top) -- it clutters our archives.

Before I look at this again: Are there any objections to another CRC
implementation for the reason of having no buildfarm member?

The performance numbers presented upthread for the CRC computations
are kind of nice in this environment, but honestly I have no idea how
much this architecture is used. Perhaps that's only something in
China? I am not seeing much activity around that in Japan, for
instance, and that's really close.

Anyway, based on today's state of the buildfarm, we have a buildfarm
member named cisticola that should be able to test this new CRC
implementation, so I see no problem in applying this stuff now if you
think it is in good shape.
--
Michael

#13Nathan Bossart
nathandbossart@gmail.com
In reply to: Michael Paquier (#12)
Re: [PATCH] Add loongarch native checksum implementation.

On Wed, Jul 26, 2023 at 12:16:28PM +0900, Michael Paquier wrote:

On Wed, Jul 05, 2023 at 02:11:02PM +0700, John Naylor wrote:

Before I look at this again: Are there any objections to another CRC
implementation for the reason of having no buildfarm member?

[ ... ]

Anyway, based on today's state of the buildfarm, we have a buildfarm
member named cisticola that should be able to test this new CRC
implementation, so I see no problem in applying this stuff now if you
think it is in good shape.

IMHO we should strive to maintain buildfarm coverage for all the
intrinsics used within Postgres, if for no other reason than to ensure
future changes do not break those platforms.

--
Nathan Bossart
Amazon Web Services: https://aws.amazon.com

#14YANG Xudong
yangxudong@ymatrix.cn
In reply to: Michael Paquier (#12)
Re: [PATCH] Add loongarch native checksum implementation.

On 2023/7/26 11:16, Michael Paquier wrote:

The performance numbers presented upthread for the CRC computations

are kind of nice in this environment, but honestly I have no idea how
much this architecture is used. Perhaps that's only something in
China? I am not seeing much activity around that in Japan, for
instance, and that's really close.

The architecture is pretty new (to open source ecosystem). The support
of it in Linux kernel and GCC were released last year.

Here is a site about the status of major open source project support of
it. (Chinese only)

https://areweloongyet.com/

#15John Naylor
john.naylor@enterprisedb.com
In reply to: Michael Paquier (#12)
Re: [PATCH] Add loongarch native checksum implementation.

On Wed, Jul 26, 2023 at 8:25 AM YANG Xudong <yangxudong@ymatrix.cn> wrote:

Many thanks to huchangqi. Now we have loongarch64 support for both old
world ABI and new world ABI on the buildfarm!

Glad to hear it!

On Wed, Jul 26, 2023 at 10:16 AM Michael Paquier <michael@paquier.xyz>
wrote:

The performance numbers presented upthread for the CRC computations
are kind of nice in this environment, but honestly I have no idea how
much this architecture is used. Perhaps that's only something in
China? I am not seeing much activity around that in Japan, for
instance, and that's really close.

That was my impression as well. My thinking was, we can give the same
treatment that we gave Arm a number of years ago (which is now quite
mainstream).

Anyway, based on today's state of the buildfarm, we have a buildfarm
member named cisticola that should be able to test this new CRC
implementation, so I see no problem in applying this stuff now if you
think it is in good shape.

I believe there was just a comment that needed updating, so I'll do that
and push within a few days.

--
John Naylor
EDB: http://www.enterprisedb.com

#16John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#5)
Re: [PATCH] Add loongarch native checksum implementation.

On Fri, Jun 16, 2023 at 8:28 AM YANG Xudong <yangxudong@ymatrix.cn> wrote:

+# If the intrinsics are supported, sets

pgac_loongarch_crc32c_intrinsics,

+# and CFLAGS_CRC.

+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

(Looking again at v2)

The compilation test is found in c-compiler.m4, which still has all logic
for CFLAGS_CRC, including saving and restoring the old CFLAGS. Can this
also be simplified?

I diff'd pg_crc32c_loongarch.c with the current other files, and found it
is structurally the same as the Arm implementation. That's logical if
memory alignment is important.

  /*
- * ARMv8 doesn't require alignment, but aligned memory access is
- * significantly faster. Process leading bytes so that the loop below
- * starts with a pointer aligned to eight bytes.
+ * Aligned memory access is significantly faster.
+ * Process leading bytes so that the loop below starts with a pointer
aligned to eight bytes.

Can you confirm the alignment requirement -- it's not clear what the
intention is since "doesn't require" wasn't carried over. Is there any
documentation (or even a report in some other context) about aligned vs
unaligned memory access performance?

--
John Naylor
EDB: http://www.enterprisedb.com

#17YANG Xudong
yangxudong@ymatrix.cn
In reply to: John Naylor (#16)
1 attachment(s)
Re: [PATCH] Add loongarch native checksum implementation.

Thanks for the comment. I have updated the patch to v3. Please have a look.

On 2023/8/7 19:01, John Naylor wrote:

On Fri, Jun 16, 2023 at 8:28 AM YANG Xudong <yangxudong@ymatrix.cn
<mailto:yangxudong@ymatrix.cn>> wrote:

+# If the intrinsics are supported, sets

pgac_loongarch_crc32c_intrinsics,

+# and CFLAGS_CRC.

+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+# CFLAGS_CRC is set if the extra flag is required.

Same here -- it seems we don't need to set CFLAGS_CRC at all. Can you
confirm?

We don't need to set CFLAGS_CRC as commented. I have updated the
configure script to make it align with the logic in meson build script.

(Looking again at v2)

The compilation test is found in c-compiler.m4, which still has all
logic for CFLAGS_CRC, including saving and restoring the old CFLAGS. Can
this also be simplified?

Fixed the function in c-compiler.m4 by removing the function argument
and the logic of handling CFLAGS and CFLAGS_CRC.

I diff'd pg_crc32c_loongarch.c with the current other files, and found
it is structurally the same as the Arm implementation. That's logical if
memory alignment is important.

  /*
- * ARMv8 doesn't require alignment, but aligned memory access is
- * significantly faster. Process leading bytes so that the loop below
- * starts with a pointer aligned to eight bytes.
+ * Aligned memory access is significantly faster.
+ * Process leading bytes so that the loop below starts with a pointer 
aligned to eight bytes.

Can you confirm the alignment requirement -- it's not clear what the
intention is since "doesn't require" wasn't carried over. Is there any
documentation (or even a report in some other context) about aligned vs
unaligned memory access performance?

It is in the official document that the alignment is not required.

https://github.com/loongson/la-softdev-convention/blob/master/la-softdev-convention.adoc#74-unaligned-memory-access-support

However, I found this patch in LKML that shows great performance gain
when using aligned memory access similar to this patch.

https://lore.kernel.org/lkml/20230410115734.93365-1-wangrui@loongson.cn/

So I guess using aligned memory access is necessary and I have updated
the comment in the code.

Show quoted text

--
John Naylor
EDB: http://www.enterprisedb.com <http://www.enterprisedb.com>

Attachments:

v3-0001-Add-loongarch-native-checksum-implementation.patchtext/plain; charset=UTF-8; name=v3-0001-Add-loongarch-native-checksum-implementation.patchDownload
From ced3f65d7445bdcca0628c4f5073b5657a81cd28 Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxudong@ymatrix.cn>
Date: Tue, 8 Aug 2023 10:41:58 +0800
Subject: [PATCH v3] Add loongarch native checksum implementation.

---
 config/c-compiler.m4           | 29 ++++++++++++++
 configure                      | 69 ++++++++++++++++++++++++++++----
 configure.ac                   | 33 +++++++++++----
 meson.build                    | 24 +++++++++++
 src/include/pg_config.h.in     |  3 ++
 src/include/port/pg_crc32c.h   |  9 +++++
 src/port/meson.build           |  3 ++
 src/port/pg_crc32c_loongarch.c | 73 ++++++++++++++++++++++++++++++++++
 8 files changed, 228 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..7777ad6e90 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,32 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
+  [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 963fbbcf1e..c5eb2814f1 100755
--- a/configure
+++ b/configure
@@ -18047,6 +18047,47 @@ fi
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
 
 
 # Select CRC-32C implementation.
@@ -18065,7 +18106,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18083,10 +18124,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18127,12 +18173,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 5153b8b3fd..3011b8ea34 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS()
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2117,7 +2123,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2135,10 +2141,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2166,9 +2177,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 0a11efc97a..2acb204003 100644
--- a/meson.build
+++ b/meson.build
@@ -2065,6 +2065,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index ee209d6d70..d8a2985567 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -714,6 +714,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 9d0cd93c43..deb354418d 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..2897920800
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Loongarch desktop and server chips support unaligned memory access by default.
+	 * However, aligned memory access is significantly faster.
+	 * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0

#18John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#17)
2 attachment(s)
Re: [PATCH] Add loongarch native checksum implementation.

On Tue, Aug 8, 2023 at 10:07 AM YANG Xudong <yangxudong@ymatrix.cn> wrote:

On 2023/8/7 19:01, John Naylor wrote:

The compilation test is found in c-compiler.m4, which still has all
logic for CFLAGS_CRC, including saving and restoring the old CFLAGS. Can
this also be simplified?

Fixed the function in c-compiler.m4 by removing the function argument
and the logic of handling CFLAGS and CFLAGS_CRC.

Looks good to me. It seems that LoongArch platforms capable of running Postgres
only support 64 bit. If that ever changes, the compiler intrinsic test (with 8
byte CRC input) should still gate that well enough in autoconf, I believe,
so in v4 I added a comment to clarify this. The Meson build checks hostcpu
first for all platforms, and the patch is consistent with surrounding code.
In the attached 0002 addendum, I change a comment in configure.ac to
clarify "override" is referring to the runtime check for x86 and Arm, and
that LoongArch doesn't need one.

Can you confirm the alignment requirement -- it's not clear what the
intention is since "doesn't require" wasn't carried over. Is there any
documentation (or even a report in some other context) about aligned vs
unaligned memory access performance?

It is in the official document that the alignment is not required.

https://github.com/loongson/la-softdev-convention/blob/master/la-softdev-convention.adoc#74-unaligned-memory-access-support

However, I found this patch in LKML that shows great performance gain
when using aligned memory access similar to this patch.

https://lore.kernel.org/lkml/20230410115734.93365-1-wangrui@loongson.cn/

So I guess using aligned memory access is necessary and I have updated
the comment in the code.

Okay, so alignment is not "necessary" in the sense that unaligned access would be
illegal, so I'm thinking we can just re-use the Arm comment language, as in 0002.

v4 0001 is the same as v3, but with a draft commit message. I will squash
and commit this week, unless there is additional feedback.

--
John Naylor
EDB: http://www.enterprisedb.com

Attachments:

v4-0002-Some-minor-adjustemts-to-be-squashed.patchtext/x-patch; charset=US-ASCII; name=v4-0002-Some-minor-adjustemts-to-be-squashed.patchDownload
From 561893beb4e3e008196b3e571685503e25a243f1 Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Tue, 8 Aug 2023 12:58:07 +0700
Subject: [PATCH v4 2/2] Some minor adjustments to be squashed

---
 config/c-compiler.m4           | 4 ++++
 configure                      | 7 +++++--
 configure.ac                   | 7 +++++--
 src/port/pg_crc32c_loongarch.c | 6 +++---
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 7777ad6e90..bd3e6d6623 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -669,6 +669,10 @@ undefine([Ac_cachevar])dnl
 # __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
 # intrinsic functions.
 #
+# We test for the 8-byte variant since platforms capable of running
+# Postgres are 64-bit only (as of PG17), so we know CRC instructions
+# are available without a runtime check.
+#
 # If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
 AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
 [define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
diff --git a/configure b/configure
index fe0b02aa80..6a80e374f1 100755
--- a/configure
+++ b/configure
@@ -18119,8 +18119,11 @@ fi
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
 if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
@@ -18139,8 +18142,8 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
+          # LoongArch CRCC instructions.
           if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            # LoongArch CRCC instructions.
             USE_LOONGARCH_CRC32C=1
           else
             # fall back to slicing-by-8 algorithm, which doesn't require any
diff --git a/configure.ac b/configure.ac
index 57f0f836c7..6105af6996 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2130,8 +2130,11 @@ AC_SUBST(CFLAGS_CRC)
 # we're not targeting such a processor, but can nevertheless produce code that
 # uses the CRC instructions, compile both, and select at runtime.
 #
-# You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
+# You can skip the runtime check by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
+#
+# If we are targeting a LoongArch processor, CRC instructions are
+# always available (at least on 64 bit), so no runtime check is needed.
 if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
@@ -2150,8 +2153,8 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
+          # LoongArch CRCC instructions.
           if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
-            # LoongArch CRCC instructions.
             USE_LOONGARCH_CRC32C=1
           else
             # fall back to slicing-by-8 algorithm, which doesn't require any
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
index 2897920800..db9da80e1b 100644
--- a/src/port/pg_crc32c_loongarch.c
+++ b/src/port/pg_crc32c_loongarch.c
@@ -23,9 +23,9 @@ pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
 	const unsigned char *pend = p + len;
 
 	/*
-	 * Loongarch desktop and server chips support unaligned memory access by default.
-	 * However, aligned memory access is significantly faster.
-	 * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
+	 * LoongArch doesn't require alignment, but aligned memory access is
+	 * significantly faster. Process leading bytes so that the loop below
+	 * starts with a pointer aligned to eight bytes.
 	 */
 	if (!PointerIsAligned(p, uint16) &&
 		p + 1 <= pend)
-- 
2.41.0

v4-0001-Use-native-CRC-instructions-on-LoongArch.patchtext/x-patch; charset=US-ASCII; name=v4-0001-Use-native-CRC-instructions-on-LoongArch.patchDownload
From 63666c12ee7533728b88a5028c28ea1e7c12a19a Mon Sep 17 00:00:00 2001
From: YANG Xudong <yangxudong@ymatrix.cn>
Date: Tue, 8 Aug 2023 10:41:58 +0800
Subject: [PATCH v4 1/2] Use native CRC instructions on LoongArch

As with the Intel and Arm CRC instructions, compiler intrinsics
for them must be supported by the compiler. In contrast, no runtime
check is needed. Aligned memory access is faster, so use the coding
in pg_crc32c_armv8.c as the model.

YANG Xudong

Discussion: https://postgr.es/m/b522a0c5-e3b2-99cc-6387-58134fb88cbe%40ymatrix.cn
---
 config/c-compiler.m4           | 29 ++++++++++++++
 configure                      | 69 ++++++++++++++++++++++++++++----
 configure.ac                   | 33 +++++++++++----
 meson.build                    | 24 +++++++++++
 src/include/pg_config.h.in     |  3 ++
 src/include/port/pg_crc32c.h   |  9 +++++
 src/port/meson.build           |  3 ++
 src/port/pg_crc32c_loongarch.c | 73 ++++++++++++++++++++++++++++++++++
 8 files changed, 228 insertions(+), 15 deletions(-)
 create mode 100644 src/port/pg_crc32c_loongarch.c

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 5be8f0f08d..7777ad6e90 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -661,3 +661,32 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_LOONGARCH_CRC32C_INTRINSICS
+# ---------------------------
+# Check if the compiler supports the LoongArch CRCC instructions, using
+# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w,
+# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w
+# intrinsic functions.
+#
+# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics.
+AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl
+AC_CACHE_CHECK(
+  [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w],
+  [Ac_cachevar],
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([],
+  [unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_LOONGARCH_CRC32C_INTRINSICS
diff --git a/configure b/configure
index 2e518c8007..fe0b02aa80 100755
--- a/configure
+++ b/configure
@@ -18062,6 +18062,47 @@ fi
 
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5
+$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; }
+if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+unsigned int crc = 0;
+   crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+   crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+   /* return computed value, to prevent the above being optimized away */
+   return crc == 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_loongarch_crc32c_intrinsics=yes
+else
+  pgac_cv_loongarch_crc32c_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5
+$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; }
+if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then
+  pgac_loongarch_crc32c_intrinsics=yes
+fi
+
+
 
 
 # Select CRC-32C implementation.
@@ -18080,7 +18121,7 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -18098,10 +18139,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -18142,12 +18188,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
         { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5
 $as_echo "ARMv8 CRC instructions with runtime check" >&6; }
       else
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+
+$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h
+
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5
+$as_echo "LoongArch CRCC instructions" >&6; }
+        else
 
 $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h
 
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5
 $as_echo "slicing-by-8" >&6; }
+        fi
       fi
     fi
   fi
diff --git a/configure.ac b/configure.ac
index 3ebe1a796d..57f0f836c7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2108,6 +2108,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then
   PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc])
 fi
 
+# Check for LoongArch CRC intrinsics to do CRC calculations.
+#
+# Check if __builtin_loongarch_crcc_* intrinsics can be used
+# with the default compiler flags.
+PGAC_LOONGARCH_CRC32C_INTRINSICS()
+
 AC_SUBST(CFLAGS_CRC)
 
 # Select CRC-32C implementation.
@@ -2126,7 +2132,7 @@ AC_SUBST(CFLAGS_CRC)
 #
 # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1
 # in the template or configure command line.
-if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then
+if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then
   # Use Intel SSE 4.2 if available.
   if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
     USE_SSE42_CRC32C=1
@@ -2144,10 +2150,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" &&
         if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then
           USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1
         else
-          # fall back to slicing-by-8 algorithm, which doesn't require any
-          # special CPU support.
-          USE_SLICING_BY_8_CRC32C=1
-	fi
+          if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then
+            # LoongArch CRCC instructions.
+            USE_LOONGARCH_CRC32C=1
+          else
+            # fall back to slicing-by-8 algorithm, which doesn't require any
+            # special CPU support.
+            USE_SLICING_BY_8_CRC32C=1
+          fi
+        fi
       fi
     fi
   fi
@@ -2175,9 +2186,15 @@ else
         PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o"
         AC_MSG_RESULT(ARMv8 CRC instructions with runtime check)
       else
-        AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
-        PG_CRC32C_OBJS="pg_crc32c_sb8.o"
-        AC_MSG_RESULT(slicing-by-8)
+        if test x"$USE_LOONGARCH_CRC32C" = x"1"; then
+          AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.])
+          PG_CRC32C_OBJS="pg_crc32c_loongarch.o"
+          AC_MSG_RESULT(LoongArch CRCC instructions)
+        else
+          AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).])
+          PG_CRC32C_OBJS="pg_crc32c_sb8.o"
+          AC_MSG_RESULT(slicing-by-8)
+        fi
       fi
     fi
   fi
diff --git a/meson.build b/meson.build
index 04ea348852..51582e143d 100644
--- a/meson.build
+++ b/meson.build
@@ -2073,6 +2073,30 @@ int main(void)
     cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1)
     have_optimized_crc = true
   endif
+
+elif host_cpu == 'loongarch64'
+
+  prog = '''
+int main(void)
+{
+    unsigned int crc = 0;
+    crc = __builtin_loongarch_crcc_w_b_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_h_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_w_w(0, crc);
+    crc = __builtin_loongarch_crcc_w_d_w(0, crc);
+
+    /* return computed value, to prevent the above being optimized away */
+    return crc == 0;
+}
+'''
+
+  if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w',
+      args: test_c_args)
+    # Use LoongArch CRC instruction unconditionally
+    cdata.set('USE_LOONGARCH_CRC32C', 1)
+    have_optimized_crc = true
+  endif
+
 endif
 
 if not have_optimized_crc
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index ee209d6d70..d8a2985567 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -714,6 +714,9 @@
 /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */
 #undef USE_LLVM
 
+/* Define to 1 to use LoongArch CRCC instructions. */
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len)							\
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 9d0cd93c43..deb354418d 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..2897920800
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * Loongarch desktop and server chips support unaligned memory access by default.
+	 * However, aligned memory access is significantly faster.
+	 * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0

#19YANG Xudong
yangxudong@ymatrix.cn
In reply to: John Naylor (#18)
Re: [PATCH] Add loongarch native checksum implementation.

On 2023/8/8 14:38, John Naylor wrote:

It seems that platforms capable of running Postgres
only support 64 bit.

I think so.

So I guess using aligned memory access is necessary and I have updated
the comment in the code.

Okay, so it's not "necessary" in the sense that it's illegal, so I'm
thinking we can just re-use the Arm comment language, as in 0002.

Yes. I think it is similar to Arm.

v4 0001 is the same as v3, but with a draft commit message. I will
squash and commit this week, unless there is additional feedback.

Looks good to me. Thanks for the additional patch.

#20John Naylor
john.naylor@enterprisedb.com
In reply to: YANG Xudong (#19)
Re: [PATCH] Add loongarch native checksum implementation.

On Tue, Aug 8, 2023 at 2:22 PM YANG Xudong <yangxudong@ymatrix.cn> wrote:

On 2023/8/8 14:38, John Naylor wrote:

v4 0001 is the same as v3, but with a draft commit message. I will
squash and commit this week, unless there is additional feedback.

Looks good to me. Thanks for the additional patch.

I pushed this with another small comment change. Unfortunately, I didn't
glance at the buildfarm beforehand -- it seems many members are failing an
isolation check added by commit fa2e87494, including both loongarch64
members. I'll check back periodically.

https://buildfarm.postgresql.org/cgi-bin/show_status.pl

--
John Naylor
EDB: http://www.enterprisedb.com

#21Amit Kapila
amit.kapila16@gmail.com
In reply to: John Naylor (#20)
Re: [PATCH] Add loongarch native checksum implementation.

On Thu, Aug 10, 2023 at 10:35 AM John Naylor
<john.naylor@enterprisedb.com> wrote:

On Tue, Aug 8, 2023 at 2:22 PM YANG Xudong <yangxudong@ymatrix.cn> wrote:

On 2023/8/8 14:38, John Naylor wrote:

v4 0001 is the same as v3, but with a draft commit message. I will
squash and commit this week, unless there is additional feedback.

Looks good to me. Thanks for the additional patch.

I pushed this with another small comment change. Unfortunately, I didn't glance at the buildfarm beforehand -- it seems many members are failing an isolation check added by commit fa2e87494, including both loongarch64 members. I'll check back periodically.

https://buildfarm.postgresql.org/cgi-bin/show_status.pl

In MSVC build, on doing: perl mkvcbuild.pl after this commit, I am
facing the below error:
Generating configuration headers...
undefined symbol: USE_LOONGARCH_CRC32C at src/include/pg_config.h line
718 at ../postgresql/src/tools/msvc/Mkvcbuild.pm line 872.

Am I missing something or did the commit miss something?

--
With Regards,
Amit Kapila.

#22Michael Paquier
michael@paquier.xyz
In reply to: Amit Kapila (#21)
Re: [PATCH] Add loongarch native checksum implementation.

On Thu, Aug 10, 2023 at 03:56:37PM +0530, Amit Kapila wrote:

In MSVC build, on doing: perl mkvcbuild.pl after this commit, I am
facing the below error:
Generating configuration headers...
undefined symbol: USE_LOONGARCH_CRC32C at src/include/pg_config.h line
718 at ../postgresql/src/tools/msvc/Mkvcbuild.pm line 872.

Am I missing something or did the commit miss something?

Yes, the commit has missed the addition of USE_LOONGARCH_CRC32C in
Solution.pm. If you want to be consistent with pg_config.h.in, you
could add it just after USE_LLVM, for instance.
--
Michael

#23John Naylor
john.naylor@enterprisedb.com
In reply to: Michael Paquier (#22)
Re: [PATCH] Add loongarch native checksum implementation.

On Thu, Aug 10, 2023 at 5:54 PM Michael Paquier <michael@paquier.xyz> wrote:

On Thu, Aug 10, 2023 at 03:56:37PM +0530, Amit Kapila wrote:

In MSVC build, on doing: perl mkvcbuild.pl after this commit, I am
facing the below error:
Generating configuration headers...
undefined symbol: USE_LOONGARCH_CRC32C at src/include/pg_config.h line
718 at ../postgresql/src/tools/msvc/Mkvcbuild.pm line 872.

Am I missing something or did the commit miss something?

Yes, the commit has missed the addition of USE_LOONGARCH_CRC32C in
Solution.pm. If you want to be consistent with pg_config.h.in, you
could add it just after USE_LLVM, for instance.

Oops, fixing now...

--
John Naylor
EDB: http://www.enterprisedb.com

#24Amit Kapila
amit.kapila16@gmail.com
In reply to: John Naylor (#23)
Re: [PATCH] Add loongarch native checksum implementation.

On Thu, Aug 10, 2023 at 5:07 PM John Naylor
<john.naylor@enterprisedb.com> wrote:

On Thu, Aug 10, 2023 at 5:54 PM Michael Paquier <michael@paquier.xyz> wrote:

On Thu, Aug 10, 2023 at 03:56:37PM +0530, Amit Kapila wrote:

In MSVC build, on doing: perl mkvcbuild.pl after this commit, I am
facing the below error:
Generating configuration headers...
undefined symbol: USE_LOONGARCH_CRC32C at src/include/pg_config.h line
718 at ../postgresql/src/tools/msvc/Mkvcbuild.pm line 872.

Am I missing something or did the commit miss something?

Yes, the commit has missed the addition of USE_LOONGARCH_CRC32C in
Solution.pm. If you want to be consistent with pg_config.h.in, you
could add it just after USE_LLVM, for instance.

Oops, fixing now...

It is fixed now. Thanks!

--
With Regards,
Amit Kapila.