diff --git a/contrib/pg_trgm/trgm_gist.c b/contrib/pg_trgm/trgm_gist.c
index b328a09..1f3d3e3 100644
--- a/contrib/pg_trgm/trgm_gist.c
+++ b/contrib/pg_trgm/trgm_gist.c
@@ -84,17 +84,88 @@ gtrgm_out(PG_FUNCTION_ARGS)
 static void
 makesign(BITVECP sign, TRGM *a)
 {
-	int4		k,
-				len = ARRNELEM(a);
+	int4		len = ARRNELEM(a);
 	trgm	   *ptr = GETARR(a);
-	int4		tmp = 0;
+	char	   *p;
+	char	   *endptr;
+	uint32		w1,
+				w2,
+				w3;
+	uint32		trg1,
+				trg2,
+				trg3,
+				trg4;
+	uint32	   *p32;
 
 	MemSet((void *) sign, 0, sizeof(BITVEC));
 	SETBIT(sign, SIGLENBIT);	/* set last unused bit */
-	for (k = 0; k < len; k++)
+
+	if (len == 0)
+		return;
+
+	endptr = (char *) (ptr + len);
+	/*------------------------------------------------------------------------
+	 * We have to extract each trigram into a uint32, and calculate the HASH.
+	 * This would be a lot easier if each trigram was aligned at 4-byte
+	 * boundary, but they're not. The simple way would be to copy each
+	 * trigram byte-per-byte, but that is quite slow, and this function is a
+	 * hotspot in penalty calculation.
+	 *
+	 * The first trigram in the array doesn't begin at a 4-byte boundary, the
+	 * flags byte comes first, but the next one does. So we fetch the first
+	 * trigram as a special case, and after that the following four trigrams
+	 * fall onto 4-byte words like this:
+	 *
+	 *  w1   w2   w3
+	 * AAAB BBCC CDDD
+	 *
+	 * As long as there's at least four trigrams left to process, we fetch
+	 * the next three words and extract the trigrams from them with bit
+	 * operations.
+	 *------------------------------------------------------------------------
+	 */
+	p32 = (uint32 *) (((char *) ptr) - 1);
+
+	/* Fetch and extract the initial word */
+	w1 = *(p32++);
+#ifdef WORDS_BIGENDIAN
+	trg1 = w1 << 8;
+#else
+	trg1 = w1 >> 8;
+#endif
+	HASH(sign, trg1);
+
+	while((char *) p32 < endptr - 12)
 	{
-		CPTRGM(((char *) &tmp), ptr + k);
-		HASH(sign, tmp);
+		w1 = *(p32++);
+		w2 = *(p32++);
+		w3 = *(p32++);
+
+#ifdef WORDS_BIGENDIAN
+		trg1 = w1 & 0xFFFFFF00;
+		trg2 = (w1 << 24) | ((w2 & 0xFFFF0000) >> 8);
+		trg3 = ((w2 & 0x0000FFFF) << 16) | ((w3 & 0xFF000000) >> 16);
+		trg4 = w3 << 8;
+#else
+		trg1 = w1 & 0x00FFFFFF;
+		trg2 = (w1 >> 24) | ((w2 & 0x0000FFFF) << 8);
+		trg3 = ((w2 & 0xFFFF0000) >> 16) | ((w3 & 0x000000FF) << 16);
+		trg4 = w3 >> 8;
+#endif
+
+		HASH(sign, trg1);
+		HASH(sign, trg2);
+		HASH(sign, trg3);
+		HASH(sign, trg4);
+	}
+
+	/* Handle remaining 1-3 trigrams the slow way */
+	p = (char *) p32;
+	while (p < endptr)
+	{
+		CPTRGM(((char *) &trg1), p);
+		HASH(sign, trg1);
+		p += 3;
 	}
 }
 
