From 1528a27f9a8331003a82cd645aabd876882b85ca Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Fri, 3 Jul 2015 13:48:08 -0700
Subject: [PATCH 2/2] Add several text sorting optimizations

Cache the results of strxfrm() calls made by the text abbreviation
routine, and strcoll() calls made by the text comparison routine (text
SortSupport authoritative comparator only).

Testing shows that the cost of doing this is immeasurably small in cases
where the optimization does not help, while the benefits in cases where
the optimization is effective can be quite significant.

Separately, arrange for text abbreviated keys to be represented as
unsigned integers, rather than a char array.  This is somewhat faster on
at least some platforms, since integer comparisons are now used rather
than memcmp() during sorting proper.
---
 src/backend/utils/adt/varlena.c | 118 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 105 insertions(+), 13 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 2fbbf54..7a4eed7 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -61,6 +61,9 @@ typedef struct
 	char	   *buf2;			/* 2nd string, or abbreviation strxfrm() buf */
 	int			buflen1;
 	int			buflen2;
+	int			last_len1;		/* Length of last buf1 string/strxfrm() blob */
+	int			last_len2;		/* Length of last buf2 string/strxfrm() blob */
+	int			last_returned;	/* Last comparison result (cache) */
 	bool		collate_c;
 	hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
 	hyperLogLogState full_card; /* Full key cardinality state */
@@ -71,6 +74,23 @@ typedef struct
 } TextSortSupport;
 
 /*
+ * Macro generating new representation of char array packed into Datum, making
+ * it a sizeof(Datum)-wide uint (the C type Datum is an unsigned integer).
+ * Comparisons of this representation should behave equivalently to a memcmp()
+ * of the Datum's entire original contents.  This offers a measurable
+ * performance boost on some systems.
+ */
+#ifdef WORDS_BIGENDIAN
+#define string_uint(x)	(x)
+#else									/* !WORDS_BIGENDIAN */
+#if SIZEOF_DATUM == 8
+#define string_uint(x)	BSWAP64(x)
+#else									/* SIZEOF_DATUM != 8 */
+#define string_uint(x)	BSWAP32(x)
+#endif									/* SIZEOF_DATUM == 8 */
+#endif									/* WORDS_BIGENDIAN */
+
+/*
  * This should be large enough that most strings will fit, but small enough
  * that we feel comfortable putting it on the stack
  */
@@ -1831,9 +1851,20 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
 		tss->buflen1 = TEXTBUFLEN;
 		tss->buf2 = palloc(TEXTBUFLEN);
 		tss->buflen2 = TEXTBUFLEN;
+		/* Start with invalid values */
+		tss->last_len1 = -1;
+		tss->last_len2 = -1;
 #ifdef HAVE_LOCALE_T
 		tss->locale = locale;
 #endif
+		/*
+		 * To avoid somehow confusing a strxfrm() blob and an original string
+		 * within bttextfastcmp_locale(), initialize last returned cache to a
+		 * sentinel value.  A platform-specific actual strcoll() return value
+		 * of INT_MIN seems unlikely, but if that occurs it will not cause
+		 * wrong answers.
+		 */
+		tss->last_returned = INT_MIN;
 		tss->collate_c = collate_c;
 		ssup->ssup_extra = tss;
 
@@ -1896,6 +1927,7 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup)
 {
 	text	   *arg1 = DatumGetTextPP(x);
 	text	   *arg2 = DatumGetTextPP(y);
+	bool		arg1_match;
 	TextSortSupport *tss = (TextSortSupport *) ssup->ssup_extra;
 
 	/* working state */
@@ -1914,6 +1946,11 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup)
 	/* Fast pre-check for equality, as discussed in varstr_cmp() */
 	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
 	{
+		/*
+		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
+		 * last_len2.  Existing contents of buffers might still be used by next
+		 * call.
+		 */
 		result = 0;
 		goto done;
 	}
@@ -1931,10 +1968,50 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup)
 		tss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, tss->buflen2);
 	}
 
-	memcpy(tss->buf1, a1p, len1);
-	tss->buf1[len1] = '\0';
-	memcpy(tss->buf2, a2p, len2);
-	tss->buf2[len2] = '\0';
+	/*
+	 * Attempt to re-use buffers across calls.  Also, avoid work in the event
+	 * of clustered together identical items by exploiting temporal locality.
+	 * This works well with divide-and-conquer, comparison-based sorts like
+	 * quicksort and mergesort.
+	 *
+	 * With quicksort, there is, in general, a pretty strong chance that the
+	 * same buffer contents can be used repeatedly for pivot items -- early
+	 * pivot items will account for large number of total comparisons, since
+	 * they must be compared against many (possibly all other) items.
+	 */
+	arg1_match = true;
+	if (len1 != tss->last_len1 || memcmp(tss->buf1, a1p, len1) != 0)
+	{
+		arg1_match = false;
+		memcpy(tss->buf1, a1p, len1);
+		tss->buf1[len1] = '\0';
+		tss->last_len1 = len1;
+	}
+
+	/*
+	 * While it is worth going to the trouble of trying to re-use buffer
+	 * contents across calls, ideally that will lead to entirely avoiding a
+	 * strcoll() call by using a cached return value.
+	 *
+	 * This optimization can work well again and again for the same set of
+	 * clustered together identical attributes;  as they're relocated to new
+	 * subpartitions, only one strcoll() is required for each pivot (in respect
+	 * of that clump of identical values, at least).  Similarly, the final
+	 * N-way merge of a mergesort can be effectively accelerated if each run
+	 * has its own locally clustered values.
+	 */
+	if (len2 != tss->last_len2 || memcmp(tss->buf2, a2p, len2) != 0)
+	{
+		memcpy(tss->buf2, a2p, len2);
+		tss->buf2[len2] = '\0';
+		tss->last_len2 = len2;
+	}
+	else if (arg1_match && tss->last_returned != INT_MIN)
+	{
+		/* Use result cached following last actual strcoll() call */
+		result = tss->last_returned;
+		goto done;
+	}
 
 #ifdef HAVE_LOCALE_T
 	if (tss->locale)
@@ -1951,6 +2028,8 @@ bttextfastcmp_locale(Datum x, Datum y, SortSupport ssup)
 	if (result == 0)
 		result = strcmp(tss->buf1, tss->buf2);
 
+	/* Cache result value, perhaps saving a costly strcoll() in next call */
+	tss->last_returned = result;
 done:
 	/* We can't afford to leak memory here. */
 	if (PointerGetDatum(arg1) != x)
@@ -1967,25 +2046,24 @@ done:
 static int
 bttextcmp_abbrev(Datum x, Datum y, SortSupport ssup)
 {
-	char	   *a = (char *) &x;
-	char	   *b = (char *) &y;
-	int			result;
-
-	result = memcmp(a, b, sizeof(Datum));
-
 	/*
-	 * When result = 0, the core system will call bttextfastcmp_c() or
+	 * When 0 is returned, the core system will call bttextfastcmp_c() or
 	 * bttextfastcmp_locale().  Even a strcmp() on two non-truncated strxfrm()
 	 * blobs cannot indicate *equality* authoritatively, for the same reason
 	 * that there is a strcoll() tie-breaker call to strcmp() in varstr_cmp().
 	 */
-	return result;
+	if (x > y)
+		return 1;
+	else if (x == y)
+		return 0;
+	else
+		return -1;
 }
 
 /*
  * Conversion routine for sortsupport.  Converts original text to abbreviated
  * key representation.  Our encoding strategy is simple -- pack the first 8
- * bytes of a strxfrm() blob into a Datum.
+ * bytes of a strxfrm() blob into a Datum, and treat it as an unsigned integer.
  */
 static Datum
 bttext_abbrev_convert(Datum original, SortSupport ssup)
@@ -2033,9 +2111,19 @@ bttext_abbrev_convert(Datum original, SortSupport ssup)
 			tss->buf1 = palloc(tss->buflen1);
 		}
 
+		/* Might be able to reuse strxfrm() blob from last call */
+		if (tss->last_len1 == len &&
+			memcmp(tss->buf1, authoritative_data, len) == 0)
+		{
+			memcpy(pres, tss->buf2, Min(sizeof(Datum), tss->last_len2));
+			/* No change affecting cardinality, so no hashing required */
+			goto done;
+		}
+
 		/* Just like strcoll(), strxfrm() expects a NUL-terminated string */
 		memcpy(tss->buf1, authoritative_data, len);
 		tss->buf1[len] = '\0';
+		tss->last_len1 = len;
 
 		for (;;)
 		{
@@ -2047,6 +2135,7 @@ bttext_abbrev_convert(Datum original, SortSupport ssup)
 #endif
 				bsize = strxfrm(tss->buf2, tss->buf1, tss->buflen2);
 
+			tss->last_len2 = bsize;
 			if (bsize < tss->buflen2)
 				break;
 
@@ -2104,6 +2193,9 @@ bttext_abbrev_convert(Datum original, SortSupport ssup)
 
 	addHyperLogLog(&tss->abbr_card, hash);
 
+done:
+	/* Represent Datum packed with string as uint */
+	res = string_uint(res);
 	/* Don't leak memory here */
 	if (PointerGetDatum(authoritative) != original)
 		pfree(authoritative);
-- 
1.9.1

