From c87da5330c636e939983b1ba8eaee581b4c953dd Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <peter.geoghegan86@gmail.com>
Date: Sun, 29 Nov 2015 12:51:36 -0800
Subject: [PATCH] Abort C collation text abbreviation less frequently

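Treat the C collation specially by setting a much lower bar for the
amount of entropy that abbreviated keys must capture, so that
abbreviation is aborted less frequently: prop_card is set to 0.01 for
the C collation, while other collations keep 0.20.  This is consistent
with existing cases that have cheaper conversion processes, like UUID.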

Backpatch to 9.5, where abbreviated keys for text were added.
---
 src/backend/utils/adt/varlena.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index a89f586..0bcdd96 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1869,7 +1869,7 @@ btsortsupport_worker(SortSupport ssup, Oid collid)
 		 */
 		if (abbreviate)
 		{
-			tss->prop_card = 0.20;
+			tss->prop_card = collate_c ? 0.01 : 0.20;
 			initHyperLogLog(&tss->abbr_card, 10);
 			initHyperLogLog(&tss->full_card, 10);
 			ssup->abbrev_full_comparator = ssup->comparator;
@@ -2261,7 +2261,11 @@ bttext_abbrev_abort(int memtupcount, SortSupport ssup)
 	 * cardinality against the overall size of the set in order to more
 	 * accurately model costs.  Assume that an abbreviated comparison, and an
 	 * abbreviated comparison with a cheap memcmp()-based authoritative
-	 * resolution are equivalent.
+	 * resolution are equivalent.  (With the C collation, authoritative
+	 * cardinality is used in the same way, even though the cost of an
+	 * authoritative tie-breaker is no cheaper when values are equal.  The
+	 * theory is that the early appearance of low entropy abbreviated keys
+	 * predicts the same prefix for all or most values.)
 	 */
 	if (abbrev_distinct > key_distinct * tss->prop_card)
 	{
-- 
1.9.1

