From b72327ad1cc2b35767cab6a8267892878c868f56 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 1 Aug 2024 16:38:05 +1200
Subject: [PATCH 2/2] Standardize macros for detecting architectures.

Instead of repeating compilers' architecture macros throughout the tree
and sometimes getting it wrong, let's detect them in one central place,
and define our own macros of the form:

  PG_ARCH_{ARM,LOONGARCH,MIPS,PPC,RISCV,S390,SPARC,X86}
  PG_ARCH_{ARM,LOONGARCH,MIPS,PPC,RISCV,S390,SPARC,X86}_{32,64}

This fixes the problem that MSVC builds were unintentionally using
pessimistic fallback code defined by "port/atomics.h", due to
inconsistent testing for architecture macros.  A couple of other obscure
places were also affected, but failing to include arch-x86.h on Windows
seems pretty egregious:

 * pg_{read,write}_barrier() must be falling back to pg_memory_barrier()
   instead of pg_compiler_barrier()
 * pg_spin_delay() must be falling back to nothing at all
 * PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY must not be defined

Discussion: https://postgr.es/m/CA%2BhUKGKAf_i6w7hB_3pqZXQeqn%2BixvY%2BCMps_n%3DmJ5HAatMjMw%40mail.gmail.com
---
 contrib/pgcrypto/crypt-blowfish.c    |  4 +-
 src/include/c.h                      | 57 ++++++++++++++++++++++++++++
 src/include/jit/llvmjit_backport.h   |  2 +-
 src/include/port/atomics.h           |  6 +--
 src/include/port/atomics/arch-arm.h  |  4 +-
 src/include/port/atomics/arch-x86.h  | 16 ++++----
 src/include/port/pg_bitutils.h       |  4 +-
 src/include/portability/instr_time.h |  2 +-
 src/include/storage/s_lock.h         | 16 ++++----
 src/port/pg_crc32c_armv8_choose.c    | 12 +++---
 src/port/pg_crc32c_sse42.c           |  4 +-
 11 files changed, 92 insertions(+), 35 deletions(-)

diff --git a/contrib/pgcrypto/crypt-blowfish.c b/contrib/pgcrypto/crypt-blowfish.c
index 5a1b1e10091..9c4e02e428b 100644
--- a/contrib/pgcrypto/crypt-blowfish.c
+++ b/contrib/pgcrypto/crypt-blowfish.c
@@ -38,10 +38,10 @@
 #include "px-crypt.h"
 #include "px.h"
 
-#ifdef __i386__
+#if defined(PG_ARCH_X86_32)
 #define BF_ASM				0	/* 1 */
 #define BF_SCALE			1
-#elif defined(__x86_64__)
+#elif defined(PG_ARCH_X86_64)
 #define BF_ASM				0
 #define BF_SCALE			1
 #else
diff --git a/src/include/c.h b/src/include/c.h
index 88d13ec9993..f9872ea20c7 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -585,6 +585,63 @@ typedef void (*pg_funcptr_t) (void);
 #define HAVE_PRAGMA_GCC_SYSTEM_HEADER	1
 #endif
 
+/*
+ * Project-standardized name for CPU architectures, to avoid having to repeat
+ * the names that different compilers use.
+ */
+#if defined(__arm__) || defined(__arm)
+#define PG_ARCH_ARM_32
+#elif defined(__aarch64__) || defined(_M_ARM64)
+#define PG_ARCH_ARM_64
+#elif defined(__loongarch__) && !defined(__loongarch64)
+#define PG_ARCH_LOONGARCH_32
+#elif defined(__loongarch__) && defined(__loongarch64)
+#define PG_ARCH_LOONGARCH_64
+#elif defined(__mips__) && !defined(__mips64)
+#define PG_ARCH_MIPS_32
+#elif defined(__mips__) && defined(__mips64)
+#define PG_ARCH_MIPS_64
+#elif defined(__ppc64__) || defined(__powerpc64__)
+#define PG_ARCH_PPC_64
+#elif defined(__ppc__) || defined(__powerpc__)
+#define PG_ARCH_PPC_32
+#elif defined(__riscv) && __riscv_xlen == 32
+#define PG_ARCH_RISCV_32
+#elif defined(__riscv) && __riscv_xlen == 64
+#define PG_ARCH_RISCV_64
+#elif defined(__s390x__)
+#define PG_ARCH_S390_64
+#elif defined(__s390__)
+#define PG_ARCH_S390_32
+#elif defined(__sparcv9)
+#define PG_ARCH_SPARC_64
+#elif defined(__sparc)
+#define PG_ARCH_SPARC_32
+#elif defined(__i386__) || defined (__386) || defined(_M_IX86)
+#define PG_ARCH_X86_32
+#elif defined(__x86_64__) || defined(__x86_64) || defined(__amd64) || defined(_M_AMD64)
+#define PG_ARCH_X86_64
+#endif
+
+/* Same again without specifying the word size. */
+#if defined(PG_ARCH_ARM_32) || defined(PG_ARCH_ARM_64)
+#define PG_ARCH_ARM
+#elif defined(PG_ARCH_LOONGARCH_32) || defined(PG_ARCH_LOONGARCH_64)
+#define PG_ARCH_LOONGARCH
+#elif defined(PG_ARCH_MIPS_32) || defined(PG_ARCH_MIPS_64)
+#define PG_ARCH_MIPS
+#elif defined(PG_ARCH_PPC_32) || defined(PG_ARCH_PPC_64)
+#define PG_ARCH_PPC
+#elif defined(PG_ARCH_RISCV_32) || defined(PG_ARCH_RISCV_64)
+#define PG_ARCH_RISCV
+#elif defined(PG_ARCH_S390_32) || defined(PG_ARCH_S390_64)
+#define PG_ARCH_S390
+#elif defined(PG_ARCH_SPARC_32) || defined(PG_ARCH_SPARC_64)
+#define PG_ARCH_SPARC
+#elif defined(PG_ARCH_X86_32) || defined(PG_ARCH_X86_64)
+#define PG_ARCH_X86
+#endif
+
 
 /* ----------------------------------------------------------------
  *				Section 2:	bool, true, false
diff --git a/src/include/jit/llvmjit_backport.h b/src/include/jit/llvmjit_backport.h
index 71cfdfc832f..be6fbd64773 100644
--- a/src/include/jit/llvmjit_backport.h
+++ b/src/include/jit/llvmjit_backport.h
@@ -15,7 +15,7 @@
  * class llvm::backport::SectionMemoryManager that we use as a workaround.
  * This header controls whether we use it.
  */
-#if defined(__aarch64__) && LLVM_VERSION_MAJOR < 22
+#if defined(PG_ARCH_ARM_64) && LLVM_VERSION_MAJOR < 22
 #define USE_LLVM_BACKPORT_SECTION_MEMORY_MANAGER
 #endif
 
diff --git a/src/include/port/atomics.h b/src/include/port/atomics.h
index d8b1d20fe60..8bd3b0d6dc9 100644
--- a/src/include/port/atomics.h
+++ b/src/include/port/atomics.h
@@ -63,11 +63,11 @@
  * compiler barrier.
  *
  */
-#if defined(__arm__) || defined(__arm) || defined(__aarch64__)
+#if defined(PG_ARCH_ARM)
 #include "port/atomics/arch-arm.h"
-#elif defined(__i386__) || defined(__i386) || defined(__x86_64__)
+#elif defined(PG_ARCH_X86)
 #include "port/atomics/arch-x86.h"
-#elif defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
+#elif defined(PG_ARCH_PPC)
 #include "port/atomics/arch-ppc.h"
 #endif
 
diff --git a/src/include/port/atomics/arch-arm.h b/src/include/port/atomics/arch-arm.h
index 90280c7b751..4da56fab10e 100644
--- a/src/include/port/atomics/arch-arm.h
+++ b/src/include/port/atomics/arch-arm.h
@@ -21,7 +21,7 @@
  * 64 bit atomics on ARM32 are implemented using kernel fallbacks and thus
  * might be slow, so disable entirely. On ARM64 that problem doesn't exist.
  */
-#if !defined(__aarch64__)
+#if !defined(PG_ARCH_ARM_64)
 #define PG_DISABLE_64_BIT_ATOMICS
 #else
 /*
@@ -29,4 +29,4 @@
  * general purpose register is atomic.
  */
 #define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
-#endif  /* __aarch64__ */
+#endif  /* PG_ARCH_ARM_64 */
diff --git a/src/include/port/atomics/arch-x86.h b/src/include/port/atomics/arch-x86.h
index bd6f4f56ca2..05bb27c6ae5 100644
--- a/src/include/port/atomics/arch-x86.h
+++ b/src/include/port/atomics/arch-x86.h
@@ -32,10 +32,10 @@
  */
 
 #if defined(__GNUC__) || defined(__INTEL_COMPILER)
-#if defined(__i386__) || defined(__i386)
+#if defined(PG_ARCH_X86_32)
 #define pg_memory_barrier_impl()		\
 	__asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
-#elif defined(__x86_64__)
+#elif defined(PG_ARCH_X86_64)
 #define pg_memory_barrier_impl()		\
 	__asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
 #endif
@@ -67,14 +67,14 @@ typedef struct pg_atomic_uint32
  * It's too complicated to write inline asm for 64bit types on 32bit and the
  * 486 can't do it anyway.
  */
-#ifdef __x86_64__
+#ifdef PG_ARCH_X86_64
 #define PG_HAVE_ATOMIC_U64_SUPPORT
 typedef struct pg_atomic_uint64
 {
 	/* alignment guaranteed due to being on a 64bit platform */
 	volatile uint64 value;
 } pg_atomic_uint64;
-#endif	/* __x86_64__ */
+#endif	/* PG_ARCH_X86_64 */
 
 #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
 
@@ -109,7 +109,7 @@ pg_spin_delay_impl(void)
 {
 	__asm__ __volatile__(" rep; nop			\n");
 }
-#elif defined(_MSC_VER) && defined(__x86_64__)
+#elif defined(_MSC_VER) && defined(PG_ARCH_X86_64)
 #define PG_HAVE_SPIN_DELAY
 static __forceinline void
 pg_spin_delay_impl(void)
@@ -192,7 +192,7 @@ pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
 	return res;
 }
 
-#ifdef __x86_64__
+#ifdef PG_ARCH_X86_64
 
 #define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
 static inline bool
@@ -231,7 +231,7 @@ pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
 	return res;
 }
 
-#endif /* __x86_64__ */
+#endif /* PG_ARCH_X86_64 */
 
 #endif /* defined(__GNUC__) || defined(__INTEL_COMPILER) */
 
@@ -241,6 +241,6 @@ pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
  */
 #if defined(__i568__) || defined(__i668__) || /* gcc i586+ */  \
 	(defined(_M_IX86) && _M_IX86 >= 500) || /* msvc i586+ */ \
-	defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) /* gcc, msvc */
+	defined(PG_ARCH_X86_64)
 #define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
 #endif /* 8 byte single-copy atomicity */
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 7a00d197013..0ca0c986113 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -82,7 +82,7 @@ pg_leftmost_one_pos64(uint64 word)
 #error "cannot find integer type of the same size as uint64_t"
 #endif
 
-#elif defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_ARM64))
+#elif defined(_MSC_VER) && (defined(PG_ARCH_ARM_64) || defined(PG_ARCH_X86_64))
 	unsigned long result;
 	bool		non_zero;
 
@@ -155,7 +155,7 @@ pg_rightmost_one_pos64(uint64 word)
 #error "cannot find integer type of the same size as uint64_t"
 #endif
 
-#elif defined(_MSC_VER) && (defined(_M_AMD64) || defined(_M_ARM64))
+#elif defined(_MSC_VER) && (defined(PG_ARCH_ARM_64) || defined(PG_ARCH_X86_64))
 	unsigned long result;
 	bool		non_zero;
 
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index 92558e234ac..a8834defa86 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -95,7 +95,7 @@ typedef struct instr_time
  * PG_INSTR_TSC_CLOCK controls whether the TSC clock source is compiled in, and
  * potentially used based on timing_tsc_enabled.
  */
-#if defined(__x86_64__) || defined(_M_X64)
+#if defined(PG_ARCH_X86_64)
 #define PG_INSTR_TICKS_TO_NS 1
 #define PG_INSTR_TSC_CLOCK 1
 #elif defined(WIN32)
diff --git a/src/include/storage/s_lock.h b/src/include/storage/s_lock.h
index 28f83df96d6..4229af8965f 100644
--- a/src/include/storage/s_lock.h
+++ b/src/include/storage/s_lock.h
@@ -193,7 +193,7 @@ spin_delay(void)
 #endif	 /* __i386__ */
 
 
-#ifdef __x86_64__		/* AMD Opteron, Intel EM64T */
+#ifdef PG_ARCH_X86_64		/* AMD Opteron, Intel EM64T */
 #define HAS_TEST_AND_SET
 
 typedef unsigned char slock_t;
@@ -238,7 +238,7 @@ spin_delay(void)
 		" rep; nop			\n");
 }
 
-#endif	 /* __x86_64__ */
+#endif	 /* PG_ARCH_X86_64 */
 
 
 /*
@@ -247,7 +247,7 @@ spin_delay(void)
  * We use the int-width variant of the builtin because it works on more chips
  * than other widths.
  */
-#if defined(__arm__) || defined(__arm) || defined(__aarch64__)
+#if defined(PG_ARCH_ARM)
 #ifdef HAVE_GCC__SYNC_INT32_TAS
 #define HAS_TEST_AND_SET
 
@@ -263,7 +263,7 @@ tas(volatile slock_t *lock)
 
 #define S_UNLOCK(lock) __sync_lock_release(lock)
 
-#if defined(__aarch64__)
+#if defined(PG_ARCH_ARM_64)
 
 /*
  * On ARM64, it's a win to use a non-locking test before the TAS proper.  It
@@ -285,9 +285,9 @@ spin_delay(void)
 		" isb;				\n");
 }
 
-#endif	 /* __aarch64__ */
+#endif	 /* PG_ARCH_ARM_64 */
 #endif	 /* HAVE_GCC__SYNC_INT32_TAS */
-#endif	 /* __arm__ || __arm || __aarch64__ */
+#endif	 /* PG_ARCH_ARM */
 
 
 /* S/390 and S/390x Linux (32- and 64-bit zSeries) */
@@ -391,7 +391,7 @@ do \
 
 
 /* PowerPC */
-#if defined(__ppc__) || defined(__powerpc__) || defined(__ppc64__) || defined(__powerpc64__)
+#if defined(PG_ARCH_PPC)
 #define HAS_TEST_AND_SET
 
 typedef unsigned int slock_t;
@@ -452,7 +452,7 @@ do \
 #endif /* powerpc */
 
 
-#if defined(__mips__)
+#if defined(PG_ARCH_MIPS)
 #define HAS_TEST_AND_SET
 
 typedef unsigned int slock_t;
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index 591e23df44b..ff9afd55e7d 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -27,14 +27,14 @@
 #if defined(HAVE_ELF_AUX_INFO) || defined(HAVE_GETAUXVAL)
 #include <sys/auxv.h>
 /* Ancient glibc releases don't include the HWCAPxxx macros in sys/auxv.h */
-#if defined(__linux__) && (defined(__aarch64__) ? !defined(HWCAP_CRC32) : !defined(HWCAP2_CRC32))
+#if defined(__linux__) && (defined(PG_ARCH_ARM_64) ? !defined(HWCAP_CRC32) : !defined(HWCAP2_CRC32))
 #include <asm/hwcap.h>
 #endif
 #endif
 
 #if defined(__NetBSD__)
 #include <sys/sysctl.h>
-#if defined(__aarch64__)
+#if defined(PG_ARCH_ARM_64)
 #include <aarch64/armreg.h>
 #endif
 #endif
@@ -47,7 +47,7 @@ pg_crc32c_armv8_available(void)
 #if defined(HAVE_ELF_AUX_INFO)
 	unsigned long value;
 
-#ifdef __aarch64__
+#ifdef PG_ARCH_ARM_64
 	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
 		(value & HWCAP_CRC32) != 0;
 #else
@@ -55,7 +55,7 @@ pg_crc32c_armv8_available(void)
 		(value & HWCAP2_CRC32) != 0;
 #endif
 #elif defined(HAVE_GETAUXVAL)
-#ifdef __aarch64__
+#ifdef PG_ARCH_ARM_64
 	return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0;
 #else
 	return (getauxval(AT_HWCAP2) & HWCAP2_CRC32) != 0;
@@ -74,7 +74,7 @@ pg_crc32c_armv8_available(void)
 
 	size_t		len;
 	uint64		sysctlbuf[SYSCTL_CPU_ID_MAXSIZE];
-#if defined(__aarch64__)
+#if defined(PG_ARCH_ARM_64)
 	/* We assume cpu0 is representative of all the machine's CPUs. */
 	const char *path = "machdep.cpu0.cpu_id";
 	size_t		expected_len = sizeof(struct aarch64_sysctl_cpu_id);
@@ -112,7 +112,7 @@ pg_crc32c_armv8_available(void)
 static bool
 pg_pmull_available(void)
 {
-#if defined(__aarch64__) && defined(HWCAP_PMULL)
+#if defined(PG_ARCH_ARM_64) && defined(HWCAP_PMULL)
 
 #ifdef HAVE_ELF_AUX_INFO
 	unsigned long value;
diff --git a/src/port/pg_crc32c_sse42.c b/src/port/pg_crc32c_sse42.c
index b8e77faf4d9..f0759ffcb26 100644
--- a/src/port/pg_crc32c_sse42.c
+++ b/src/port/pg_crc32c_sse42.c
@@ -39,7 +39,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 	 * and performance testing didn't show any performance gain from aligning
 	 * the begin address.
 	 */
-#ifdef __x86_64__
+#ifdef PG_ARCH_X86_64
 	while (p + 8 <= pend)
 	{
 		crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
@@ -63,7 +63,7 @@ pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
 		crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
 		p += 4;
 	}
-#endif							/* __x86_64__ */
+#endif							/* PG_ARCH_X86_64 */
 
 	/* Process any remaining bytes one at a time. */
 	while (p < pend)
-- 
2.53.0

