*** a/contrib/pg_trgm/trgm_gist.c
--- b/contrib/pg_trgm/trgm_gist.c
***************
*** 82,88 **** gtrgm_out(PG_FUNCTION_ARGS)
  }
  
  static void
! makesign(BITVECP sign, TRGM *a)
  {
  	int4		k,
  				len = ARRNELEM(a);
--- 82,88 ----
  }
  
  static void
! origmakesign(BITVECP sign, TRGM *a)
  {
  	int4		k,
  				len = ARRNELEM(a);
***************
*** 98,103 **** makesign(BITVECP sign, TRGM *a)
--- 98,316 ----
  	}
  }
  
+ /*
+  * Lookup tables for setting nth bit in a 128-bit bit array, where the
+  * bit array is stored in two 64-bit integers. That representation is used
+  * inside makesign(), elsewhere the signature is stored in a BITVEC, ie.
+  * a byte array, in little-endian order. To be compatible with the byte
+  * array representation, the 64-bit integers are in little-endian order
+  * regardless of the native endianness of the platform.
+  */
+ #ifdef WORDS_BIGENDIAN
+ #define SET_NTH_BIT_LOOKUP						\
+ 	1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 	1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F, \
+ 												\
+ 	1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 	1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ 												\
+ 	1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 	1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ 												\
+ 	1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 	1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ 												\
+ 	1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 	1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ 												\
+ 	1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 	1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ 												\
+ 	1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 	1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ 												\
+ 	1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 	1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07
+ #else
+ #define SET_NTH_BIT_LOOKUP						\
+ 	1LL<<0x00, 1LL<<0x01, 1LL<<0x02, 1LL<<0x03, \
+ 	1LL<<0x04, 1LL<<0x05, 1LL<<0x06, 1LL<<0x07, \
+ 												\
+ 	1LL<<0x08, 1LL<<0x09, 1LL<<0x0A, 1LL<<0x0B, \
+ 	1LL<<0x0C, 1LL<<0x0D, 1LL<<0x0E, 1LL<<0x0F, \
+ 												\
+ 	1LL<<0x10, 1LL<<0x11, 1LL<<0x12, 1LL<<0x13, \
+ 	1LL<<0x14, 1LL<<0x15, 1LL<<0x16, 1LL<<0x17, \
+ 												\
+ 	1LL<<0x18, 1LL<<0x19, 1LL<<0x1A, 1LL<<0x1B, \
+ 	1LL<<0x1C, 1LL<<0x1D, 1LL<<0x1E, 1LL<<0x1F, \
+ 												\
+ 	1LL<<0x20, 1LL<<0x21, 1LL<<0x22, 1LL<<0x23, \
+ 	1LL<<0x24, 1LL<<0x25, 1LL<<0x26, 1LL<<0x27, \
+ 												\
+ 	1LL<<0x28, 1LL<<0x29, 1LL<<0x2A, 1LL<<0x2B, \
+ 	1LL<<0x2C, 1LL<<0x2D, 1LL<<0x2E, 1LL<<0x2F, \
+ 												\
+ 	1LL<<0x30, 1LL<<0x31, 1LL<<0x32, 1LL<<0x33, \
+ 	1LL<<0x34, 1LL<<0x35, 1LL<<0x36, 1LL<<0x37, \
+ 												\
+ 	1LL<<0x38, 1LL<<0x39, 1LL<<0x3A, 1LL<<0x3B, \
+ 	1LL<<0x3C, 1LL<<0x3D, 1LL<<0x3E, 1LL<<0x3F
+ #endif
+ 
+ /* Lookup tables for setting the low and high 64-bits of the signature. */
+ static const uint64 lowsbox[128] =
+ {
+ 	/* 0-63 */
+ 	SET_NTH_BIT_LOOKUP,
+ 
+ 	/* 64-SIGLENBIT are zeros */
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0
+ };
+ 
+ static const uint64 highsbox[128] =
+ {
+ 	/* 0-63 are zeros. */
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 	0, 0, 0, 0,  0, 0, 0, 0,   0, 0, 0, 0,  0, 0, 0, 0,
+ 
+ 	/* 64-SIGLENBIT */
+ 	SET_NTH_BIT_LOOKUP,
+ };
+ 
+ /* Set the nth bit in the signature, stored in two uint64s. */
+ #define SETBIT_S(high, low, hash)		\
+ 	do {								\
+ 		unsigned int n = (hash);		\
+ 		high |= highsbox[n];			\
+ 		low |= lowsbox[n];				\
+ 	} while(0)
+ 
+ static void
+ makesign(BITVECP sign, TRGM *a)
+ {
+ 	int4		len = ARRNELEM(a);
+ 	trgm	   *ptr = GETARR(a);
+ 	char	   *p;
+ 	char	   *endptr;
+ 	uint32		w1,
+ 				w2,
+ 				w3;
+ 	uint32		trg0 = 0,
+ 				trg1,
+ 				trg2,
+ 				trg3,
+ 				trg4;
+ 	uint32	   *p32;
+ 
+ 	/*
+ 	 * s1 and s2 contain the signature we're calcuting. Together they
+ 	 * form one bit-array of 128 bits.
+ 	 */
+ 	uint64		highsig = 0;
+ 	uint64		lowsig = 0;
+ 
+ 	SETBIT_S(highsig, lowsig, SIGLENBIT);	/* set last unused bit */
+ 
+ 	if (len <= 0)
+ 		goto end;
+ 
+ 	/*----------
+ 	 * We have to extract each trigram into a uint32, and calculate the HASH.
+ 	 * This would be a lot easier if the trigrams were aligned on 4-byte
+ 	 * boundaries, but they're not.  The simple way would be to copy each
+ 	 * trigram byte-by-byte, but that is quite slow, and this function is a
+ 	 * hotspot in penalty calculations.
+ 	 *
+ 	 * The first trigram in the array doesn't begin at a 4-byte boundary, as
+ 	 * the flags byte comes first; but the next one does.  So we fetch the
+ 	 * first trigram as a special case, and after that each four trigrams fall
+ 	 * onto 4-byte words like this:
+ 	 *
+ 	 *  w1   w2   w3
+ 	 * AAAB BBCC CDDD
+ 	 *
+ 	 * As long as there's at least four trigrams left to process, we fetch
+ 	 * the next three words and extract the trigrams from them with bit
+ 	 * operations, per the above diagram.  The last few trigrams are handled
+ 	 * one at a time with byte-by-byte fetching.
+ 	 *
+ 	 * Note that this code yields different results on big-endian and
+ 	 * little-endian machines, because the bytes of each trigram are loaded
+ 	 * into a uint32 in memory order and left-justified.  That's probably
+ 	 * undesirable, but changing this behavior would break existing indexes.
+ 	 *----------
+ 	 */
+ 	endptr = (char *) (ptr + len);
+ 	p32 = (uint32 *) (((char *) ptr) - 1);
+ 
+ 	/* Fetch and extract the initial word */
+ 	w1 = *(p32++);
+ #ifdef WORDS_BIGENDIAN
+ 	trg1 = w1 << 8;
+ #else
+ 	trg1 = w1 >> 8;
+ #endif
+ 	SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+ 
+ 	while ((char *) p32 <= endptr - 3 * sizeof(uint32))
+ 	{
+ 		w1 = *(p32++);
+ 		w2 = *(p32++);
+ 		w3 = *(p32++);
+ 
+ #ifdef WORDS_BIGENDIAN
+ 		trg1 = w1 & 0xFFFFFF00;
+ 		trg2 = (w1 << 24) | ((w2 & 0xFFFF0000) >> 8);
+ 		trg3 = ((w2 & 0x0000FFFF) << 16) | ((w3 & 0xFF000000) >> 16);
+ 		trg4 = w3 << 8;
+ #else
+ 		trg1 = w1 & 0x00FFFFFF;
+ 		trg2 = (w1 >> 24) | ((w2 & 0x0000FFFF) << 8);
+ 		trg3 = ((w2 & 0xFFFF0000) >> 16) | ((w3 & 0x000000FF) << 16);
+ 		trg4 = w3 >> 8;
+ #endif
+ 
+ 		SETBIT_S(highsig, lowsig, HASHVAL(trg1));
+ 		SETBIT_S(highsig, lowsig, HASHVAL(trg2));
+ 		SETBIT_S(highsig, lowsig, HASHVAL(trg3));
+ 		SETBIT_S(highsig, lowsig, HASHVAL(trg4));
+ 	}
+ 
+ 	/* Handle the remaining 0-3 trigrams the slow way */
+ 	p = (char *) p32;
+ 	while (p < endptr)
+ 	{
+ 		CPTRGM(((char *) &trg0), p);
+ 		SETBIT_S(highsig, lowsig, HASHVAL(trg0));
+ 		p += 3;
+ 	}
+ 
+ end:
+ 	memcpy((char  *) sign, &lowsig, sizeof(uint64));
+ 	memcpy(((char *) sign) + sizeof(uint64), &highsig, sizeof(uint64));
+ 
+ #ifdef VERIFY_SIG
+ 	{
+ 		BITVEC osign;
+ 		origmakesign(osign, a);
+ 
+ 		if (memcmp(sign, osign, sizeof(BITVEC)) != 0)
+ 		{
+ 			elog(LOG, "mismatch: %lx %lx", highsig, lowsig);
+ 			elog(LOG, "orig: %lx %lx", ((uint64 *)osign)[0], ((uint64 *)osign)[1]);
+ 		}
+ 	}
+ #endif
+ }
+ 
  Datum
  gtrgm_compress(PG_FUNCTION_ARGS)
  {
