Consolidate 'unique array values' logic into a reusable function?

Started by Thomas Munroover 9 years ago8 messages
#1Thomas Munro
thomas.munro@enterprisedb.com
1 attachment(s)

Hi,

Looking at commits f10eab73d and c50d192c, I wondered why we don't
have a reusable in-place unique function. It may be trivial, but we
seem to have a lot of copies and variations in the tree.

Here's a sketch patch that creates a function array_unique which takes
the same arguments as qsort or qsort_arg and returns the new length.
The patch replaces all the specialised unique functions and open coded
versions that I could find with simple greps, but there are probably
more.

My compiler seems to inline the comparator function and memcpy well,
so I can't measure any speed difference between array_unique(array,
size, sizeof(int), compare_int) and a hand-crafted loop using == for
comparison and = for assignment, for a billion items.

If no one objects I'll post a version of this to a commitfest, along
with some other trivial code duplication refactoring work I posted a
while back that consolidates popcount and ffs/fls implementations. I
don't like code duplication :-)

--
Thomas Munro
http://www.enterprisedb.com

Attachments:

array-unique.patchapplication/octet-stream; name=array-unique.patchDownload
diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 0c1d99a..0e7d02f 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -323,6 +323,10 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 	}
 
 	qsort((void *) a, l, sizeof(Pairs), comparePairs);
+	/*
+	 * We can't use array_unique here because we have some clean-up code to
+	 * run on removed elements.
+	 */
 	ptr = a + 1;
 	res = a;
 	while (ptr - a < l)
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index 3c52912..ed38883 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -4,6 +4,7 @@
 #include "postgres.h"
 
 #include "catalog/pg_type.h"
+#include "lib/arrayutils.h"
 
 #include "_int.h"
 
@@ -293,23 +294,13 @@ internal_size(int *a, int len)
 ArrayType *
 _int_unique(ArrayType *r)
 {
-	int		   *tmp,
-			   *dr,
-			   *data;
 	int			num = ARRNELEMS(r);
+	bool		duplicates_found; /* not used */
 
-	if (num < 2)
-		return r;
+	num = array_unique_arg(ARRPTR(r), num, sizeof(int), isort_cmp,
+						   &duplicates_found);
 
-	data = tmp = dr = ARRPTR(r);
-	while (tmp - data < num)
-	{
-		if (*tmp != *dr)
-			*(++dr) = *tmp++;
-		else
-			tmp++;
-	}
-	return resize_intArrayType(r, dr + 1 - ARRPTR(r));
+	return resize_intArrayType(r, num);
 }
 
 void
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index dd0f492..e5f9288 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -8,6 +8,7 @@
 #include "trgm.h"
 
 #include "catalog/pg_type.h"
+#include "lib/arrayutils.h"
 #include "tsearch/ts_locale.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -111,26 +112,6 @@ comp_trgm(const void *a, const void *b)
 	return CMPTRGM(a, b);
 }
 
-static int
-unique_array(trgm *a, int len)
-{
-	trgm	   *curend,
-			   *tmp;
-
-	curend = tmp = a;
-	while (tmp - a < len)
-		if (CMPTRGM(tmp, curend))
-		{
-			curend++;
-			CPTRGM(curend, tmp);
-			tmp++;
-		}
-		else
-			tmp++;
-
-	return curend + 1 - a;
-}
-
 /*
  * Finds first word in string, returns pointer to the word,
  * endword points to the character after word
@@ -340,7 +321,7 @@ generate_trgm(char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = array_unique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
@@ -853,7 +834,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = array_unique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 83c553c..7444c39 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -20,6 +20,7 @@
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
+#include "lib/arrayutils.h"
 #include "miscadmin.h"
 #include "utils/array.h"
 #include "utils/lsyscache.h"
@@ -438,8 +439,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 	Oid			elemtype;
 	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
-	int			last_non_dup;
-	int			i;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
@@ -478,20 +477,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, (void *) &cxt);
 
 	/* Now scan the sorted elements and remove duplicates */
-	last_non_dup = 0;
-	for (i = 1; i < nelems; i++)
-	{
-		int32		compare;
-
-		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
-												  cxt.collation,
-												  elems[last_non_dup],
-												  elems[i]));
-		if (compare != 0)
-			elems[++last_non_dup] = elems[i];
-	}
-
-	return last_non_dup + 1;
+	return array_unique_arg(elems, nelems, sizeof(Datum),
+							_bt_compare_array_elements, &cxt);
 }
 
 /*
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
index 2604103..bbfc98c 100644
--- a/src/backend/executor/nodeTidscan.c
+++ b/src/backend/executor/nodeTidscan.c
@@ -26,6 +26,7 @@
 #include "catalog/pg_type.h"
 #include "executor/execdebug.h"
 #include "executor/nodeTidscan.h"
+#include "lib/arrayutils.h"
 #include "optimizer/clauses.h"
 #include "storage/bufmgr.h"
 #include "utils/array.h"
@@ -193,21 +194,13 @@ TidListCreate(TidScanState *tidstate)
 	 */
 	if (numTids > 1)
 	{
-		int			lastTid;
-		int			i;
-
 		/* CurrentOfExpr could never appear OR'd with something else */
 		Assert(!tidstate->tss_isCurrentOf);
 
 		qsort((void *) tidList, numTids, sizeof(ItemPointerData),
 			  itemptr_comparator);
-		lastTid = 0;
-		for (i = 1; i < numTids; i++)
-		{
-			if (!ItemPointerEquals(&tidList[lastTid], &tidList[i]))
-				tidList[++lastTid] = tidList[i];
-		}
-		numTids = lastTid + 1;
+		numTids = array_unique(tidList, numTids, sizeof(ItemPointerData),
+							   itemptr_comparator);
 	}
 
 	tidstate->tss_TidList = tidList;
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
index cd9a323..405c539 100644
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -36,6 +36,8 @@
  * the color chains.
  */
 
+#include "lib/arrayutils.h"
+
 #define NISERR()	VISERR(nfa->v)
 #define NERR(e)		VERR(nfa->v, (e))
 
@@ -914,7 +916,6 @@ mergeins(struct nfa * nfa,
 {
 	struct arc *na;
 	int			i;
-	int			j;
 
 	if (arccount <= 0)
 		return;
@@ -936,28 +937,9 @@ mergeins(struct nfa * nfa,
 
 	qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
 
-	/*
-	 * arcarray very likely includes dups, so we must eliminate them.  (This
-	 * could be folded into the next loop, but it's not worth the trouble.)
-	 */
-	j = 0;
-	for (i = 1; i < arccount; i++)
-	{
-		switch (sortins_cmp(&arcarray[j], &arcarray[i]))
-		{
-			case -1:
-				/* non-dup */
-				arcarray[++j] = arcarray[i];
-				break;
-			case 0:
-				/* dup */
-				break;
-			default:
-				/* trouble */
-				assert(NOTREACHED);
-		}
-	}
-	arccount = j + 1;
+	/* arcarray very likely includes dups, so we must eliminate them. */
+	arccount = array_unique(arcarray, arccount, sizeof(struct arc *),
+							sortins_cmp);
 
 	/*
 	 * Now merge into s' inchain.  Note that createarc() will put new arcs
diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c
index 025a99e..6d39d77 100644
--- a/src/backend/utils/adt/acl.c
+++ b/src/backend/utils/adt/acl.c
@@ -28,6 +28,7 @@
 #include "commands/tablespace.h"
 #include "foreign/foreign.h"
 #include "funcapi.h"
+#include "lib/arrayutils.h"
 #include "miscadmin.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
@@ -1458,8 +1459,7 @@ aclmembers(const Acl *acl, Oid **roleids)
 	Oid		   *list;
 	const AclItem *acldat;
 	int			i,
-				j,
-				k;
+				j;
 
 	if (acl == NULL || ACL_NUM(acl) == 0)
 	{
@@ -1491,21 +1491,14 @@ aclmembers(const Acl *acl, Oid **roleids)
 	/* Sort the array */
 	qsort(list, j, sizeof(Oid), oidComparator);
 
-	/* Remove duplicates from the array */
-	k = 0;
-	for (i = 1; i < j; i++)
-	{
-		if (list[k] != list[i])
-			list[++k] = list[i];
-	}
-
 	/*
 	 * We could repalloc the array down to minimum size, but it's hardly worth
 	 * it since it's only transient memory.
 	 */
 	*roleids = list;
 
-	return k + 1;
+	/* Remove duplicates from the array */
+	return array_unique(list, j, sizeof(Oid), oidComparator);
 }
 
 /*
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 6cdfb13..7474b8e 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/tuptoaster.h"
+#include "lib/arrayutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/pg_crc.h"
 
@@ -130,41 +131,16 @@ gtsvectorout(PG_FUNCTION_ARGS)
 }
 
 static int
-compareint(const void *va, const void *vb)
+compare_int(const void *va, const void *vb)
 {
-	int32		a = *((const int32 *) va);
-	int32		b = *((const int32 *) vb);
+	int			a = *((const int *) va);
+	int			b = *((const int *) vb);
 
 	if (a == b)
 		return 0;
 	return (a > b) ? 1 : -1;
 }
 
-/*
- * Removes duplicates from an array of int32. 'l' is
- * size of the input array. Returns the new size of the array.
- */
-static int
-uniqueint(int32 *a, int32 l)
-{
-	int32	   *ptr,
-			   *res;
-
-	if (l <= 1)
-		return l;
-
-	ptr = res = a;
-
-	qsort((void *) a, l, sizeof(int32), compareint);
-
-	while (ptr - a < l)
-		if (*ptr != *res)
-			*(++res) = *ptr++;
-		else
-			ptr++;
-	return res + 1 - a;
-}
-
 static void
 makesign(BITVECP sign, SignTSVector *a)
 {
@@ -211,7 +187,8 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 			ptr++;
 		}
 
-		len = uniqueint(GETARR(res), val->size);
+		qsort(GETARR(res), val->size, sizeof(int), compare_int);
+		len = array_unique(GETARR(res), val->size, sizeof(int), compare_int);
 		if (len != val->size)
 		{
 			/*
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c
index a574b4b..915aaf5 100644
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "lib/arrayutils.h"
 #include "tsearch/ts_utils.h"
 
 Datum
@@ -301,29 +302,6 @@ cmp_string(const void *a, const void *b)
 	return strcmp(sa, sb);
 }
 
-static int
-remove_duplicates(char **strings, int n)
-{
-	if (n <= 1)
-		return n;
-	else
-	{
-		int			i;
-		char	   *prev = strings[0];
-		int			new_n = 1;
-
-		for (i = 1; i < n; i++)
-		{
-			if (strcmp(strings[i], prev) != 0)
-			{
-				strings[new_n++] = strings[i];
-				prev = strings[i];
-			}
-		}
-		return new_n;
-	}
-}
-
 Datum
 tsq_mcontains(PG_FUNCTION_ARGS)
 {
@@ -341,9 +319,11 @@ tsq_mcontains(PG_FUNCTION_ARGS)
 
 	/* Sort and remove duplicates from both arrays */
 	qsort(query_values, query_nvalues, sizeof(char *), cmp_string);
-	query_nvalues = remove_duplicates(query_values, query_nvalues);
+	query_nvalues = array_unique(query_values, query_nvalues, sizeof(char *),
+								 cmp_string);
 	qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string);
-	ex_nvalues = remove_duplicates(ex_values, ex_nvalues);
+	ex_nvalues = array_unique(ex_values, ex_nvalues, sizeof(char *),
+							  cmp_string);
 
 	if (ex_nvalues > query_nvalues)
 		result = false;
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index 41bf3fb..af58210 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -40,8 +40,9 @@ compareWordEntryPos(const void *a, const void *b)
 }
 
 /*
- * Removes duplicate pos entries. If there's two entries with same pos
- * but different weight, the higher weight is retained.
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * array_unique here.
  *
  * Returns new length.
  */
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index ad5a254..abbc680 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -20,6 +20,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "funcapi.h"
+#include "lib/arrayutils.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/parse_coerce.h"
@@ -474,16 +475,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 */
 	if (indices_count > 1)
 	{
-		int			kp;
-
 		qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
-		kp = 0;
-		for (k = 1; k < indices_count; k++)
-		{
-			if (indices_to_delete[k] != indices_to_delete[kp])
-				indices_to_delete[++kp] = indices_to_delete[k];
-		}
-		indices_count = ++kp;
+		indices_count = array_unique(indices_to_delete, indices_count,
+									 sizeof(int), compare_int);
 	}
 
 	/*
@@ -760,7 +754,6 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	bool	   *nulls;
 	int			nitems,
 				i,
-				j,
 				tslen,
 				datalen = 0;
 	char	   *cur;
@@ -780,13 +773,8 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	if (nitems > 1)
 	{
 		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
-		j = 0;
-		for (i = 1; i < nitems; i++)
-		{
-			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
-				dlexemes[++j] = dlexemes[i];
-		}
-		nitems = ++j;
+		nitems = array_unique(dlexemes, nitems, sizeof(Datum),
+							  compare_text_lexemes);
 	}
 
 	/* Calculate space needed for surviving lexemes. */
@@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 }
 
 /*
- * Removes duplicate pos entries. We can't use uniquePos() from
- * tsvector.c because array might be longer than MAXENTRYPOS
- *
- * Returns new length.
- */
-static int
-uniqueLongPos(WordEntryPos *pos, int npos)
-{
-	WordEntryPos *pos_iter,
-			   *result;
-
-	if (npos <= 1)
-		return npos;
-
-	qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
-
-	result = pos;
-	pos_iter = pos + 1;
-	while (pos_iter < pos + npos)
-	{
-		if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
-		{
-			result++;
-			*result = WEP_GETPOS(*pos_iter);
-		}
-
-		pos_iter++;
-	}
-
-	return result + 1 - pos;
-}
-
-/*
  * is there value 'val' in array or not ?
  */
 static bool
@@ -1396,7 +1351,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
-			data->npos = uniqueLongPos(allpos, npos);
+			qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+			data->npos = array_unique(data->pos, npos, sizeof(WordEntryPos),
+									  compareWordEntryPos);
 			data->allocated = true;
 		}
 	}
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 65ffe84..4c48b89 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -66,6 +66,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#include "lib/arrayutils.h"
 #include "utils/rel.h"
 #include "utils/catcache.h"
 #include "utils/syscache.h"
@@ -874,8 +875,6 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
-	int			i,
-				j;
 
 	Assert(!CacheInitialized);
 
@@ -909,21 +908,15 @@ InitCatalogCache(void)
 	/* Sort and de-dup OID arrays, so we can use binary search. */
 	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheRelationOidSize; i++)
-	{
-		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
-			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
-	}
-	SysCacheRelationOidSize = j + 1;
+	SysCacheRelationOidSize =
+		array_unique(SysCacheRelationOid, SysCacheRelationOidSize,
+					 sizeof(Oid), oid_compare);
 
 	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++)
-	{
-		if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j])
-			SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i];
-	}
-	SysCacheSupportingRelOidSize = j + 1;
+	SysCacheSupportingRelOidSize =
+		array_unique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
+					 sizeof(Oid), oid_compare);
 
 	CacheInitialized = true;
 }
diff --git a/src/include/lib/arrayutils.h b/src/include/lib/arrayutils.h
new file mode 100644
index 0000000..a65c67d
--- /dev/null
+++ b/src/include/lib/arrayutils.h
@@ -0,0 +1,63 @@
+/*-------------------------------------------------------------------------
+ *
+ * arrayutils.h
+ *		inline C-array utility functions
+ * Portions Copyright (c) 2016, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/arrayutils.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef ARRAYUTILS_H
+#define ARRAYUTILS_H
+
+/*
+ * Remove duplicates from a pre-sorted array, according to a user-supplied
+ * comparator.  Usually the array should have been sorted with qsort using the
+ * same arguments.  Returns the new size.
+ */
+static inline size_t
+array_unique(void *array, size_t elements, size_t width,
+			 int (*compare)(const void *, const void *))
+{
+	int i, j;
+	char *bytes = array;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+/*
+ * Like array_unique, but takes a comparator with an extra user data argument
+ * which is passed through, for compatibility with qsort_arg.
+ */
+static inline size_t
+array_unique_arg(void *array, size_t elements, size_t width,
+				 int (*compare)(const void *, const void *, void *),
+				 void *arg)
+{
+	int i, j;
+	char *bytes = array;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+#endif
#2Tom Lane
tgl@sss.pgh.pa.us
In reply to: Thomas Munro (#1)
Re: Consolidate 'unique array values' logic into a reusable function?

Thomas Munro <thomas.munro@enterprisedb.com> writes:

Here's a sketch patch that creates a function array_unique which takes
the same arguments as qsort or qsort_arg and returns the new length.

Hmm ... I'd be against using this in backend/regex/, because I still
have hopes of converting that to a standalone library someday (and
in any case it needs to stay compatible with Tcl's copy of the code).
But otherwise this seems like a reasonable proposal.

As for the function name, maybe "qunique()" to go with "qsort()"?
I'm not thrilled with "array_unique" because that sounds like it
is meant for Postgres' array data types.

regards, tom lane

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Thomas Munro
thomas.munro@gmail.com
In reply to: Tom Lane (#2)
1 attachment(s)
Re: Consolidate 'unique array values' logic into a reusable function?

Hello,

I'm reviving a thread from 2016, because I wanted this thing again today.

Tom Lane <tgl@sss.pgh.pa.us> wrote:

Thomas Munro <thomas.munro@enterprisedb.com> writes:

Here's a sketch patch that creates a function array_unique which takes
the same arguments as qsort or qsort_arg and returns the new length.

Hmm ... I'd be against using this in backend/regex/, because I still
have hopes of converting that to a standalone library someday (and
in any case it needs to stay compatible with Tcl's copy of the code).
But otherwise this seems like a reasonable proposal.

As for the function name, maybe "qunique()" to go with "qsort()"?
I'm not thrilled with "array_unique" because that sounds like it
is meant for Postgres' array data types.

OK, here it is renamed to qunique() and qunique_arg(). It's a bit odd
because it has nothing to do with the quicksort algorithm, but make
some sense because it's always used with qsort(). I suppose we could
save a few more lines if there were a qsort_unique() function that
does both, since the arguments are identical. I also moved it into a
new header lib/qunique.h. Any better ideas for where it should live?
I removed the hunk under regex.

One thing I checked is that on my system it is inlined along with the
comparator when that is visible, so no performance should be lost by
throwing away the open coded versions. This makes me think that eg
oid_cmp() should probably be defined in a header; clearly we're also
carrying a few functions that should be consolidated into a new
int32_cmp() function, somewhere, too. (It might also be interesting
to use the pg_attribute_always_inline trick to instantiate some common
qsort() specialisations for a bit of speed-up, but that's another
topic.)

Adding to CF.

--
Thomas Munro
https://enterprisedb.com

Attachments:

0001-Consolidate-code-that-makes-a-sorted-array-unique.patchapplication/x-patch; name=0001-Consolidate-code-that-makes-a-sorted-array-unique.patchDownload
From b5fd67b9586cfdc3e66ccbdf019ec6d95f6206ec Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Aug 2019 13:41:04 +1200
Subject: [PATCH] Consolidate code that makes a sorted array unique.

Introduce qunique() and qunique_arg() which can be used after qsort()
and qsort_arg() respectively to remove duplicate values.

Author: Thomas Munro
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/CAEepm%3D2vmFTNpAmwbGGD2WaryM6T3hSDVKQPfUwjdD_5XY6vAA%40mail.gmail.com
---
 contrib/hstore/hstore_io.c           |  5 +++
 contrib/intarray/_int_tool.c         | 19 +++-----
 contrib/pg_trgm/trgm_op.c            | 25 ++---------
 src/backend/access/nbtree/nbtutils.c | 19 ++------
 src/backend/executor/nodeTidscan.c   | 13 ++----
 src/backend/utils/adt/acl.c          | 15 ++-----
 src/backend/utils/adt/tsgistidx.c    | 29 ++-----------
 src/backend/utils/adt/tsquery_op.c   | 29 ++-----------
 src/backend/utils/adt/tsvector.c     |  5 ++-
 src/backend/utils/adt/tsvector_op.c  | 59 ++++---------------------
 src/backend/utils/cache/syscache.c   | 21 +++------
 src/include/lib/qunique.h            | 65 ++++++++++++++++++++++++++++
 12 files changed, 113 insertions(+), 191 deletions(-)
 create mode 100644 src/include/lib/qunique.h

diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 745497c76f..86d7c7c2d8 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -323,6 +323,11 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 	}
 
 	qsort((void *) a, l, sizeof(Pairs), comparePairs);
+
+	/*
+	 * We can't use qunique here because we have some clean-up code to run on
+	 * removed elements.
+	 */
 	ptr = a + 1;
 	res = a;
 	while (ptr - a < l)
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index fde8d15e2c..27d9ce7297 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -6,6 +6,7 @@
 #include <limits.h>
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 
 #include "_int.h"
 
@@ -310,23 +311,13 @@ internal_size(int *a, int len)
 ArrayType *
 _int_unique(ArrayType *r)
 {
-	int		   *tmp,
-			   *dr,
-			   *data;
 	int			num = ARRNELEMS(r);
+	bool		duplicates_found;	/* not used */
 
-	if (num < 2)
-		return r;
+	num = qunique_arg(ARRPTR(r), num, sizeof(int), isort_cmp,
+					  &duplicates_found);
 
-	data = tmp = dr = ARRPTR(r);
-	while (tmp - data < num)
-	{
-		if (*tmp != *dr)
-			*(++dr) = *tmp++;
-		else
-			tmp++;
-	}
-	return resize_intArrayType(r, dr + 1 - ARRPTR(r));
+	return resize_intArrayType(r, num);
 }
 
 void
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 0d4614e9c8..42fb625e7f 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -8,6 +8,7 @@
 #include "trgm.h"
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 #include "tsearch/ts_locale.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -163,26 +164,6 @@ comp_trgm(const void *a, const void *b)
 	return CMPTRGM(a, b);
 }
 
-static int
-unique_array(trgm *a, int len)
-{
-	trgm	   *curend,
-			   *tmp;
-
-	curend = tmp = a;
-	while (tmp - a < len)
-		if (CMPTRGM(tmp, curend))
-		{
-			curend++;
-			CPTRGM(curend, tmp);
-			tmp++;
-		}
-		else
-			tmp++;
-
-	return curend + 1 - a;
-}
-
 /*
  * Finds first word in string, returns pointer to the word,
  * endword points to the character after word
@@ -395,7 +376,7 @@ generate_trgm(char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
@@ -943,7 +924,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 4c7b2d0966..662bab97fc 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -21,6 +21,7 @@
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "commands/progress.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/array.h"
 #include "utils/datum.h"
@@ -435,8 +436,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 	Oid			elemtype;
 	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
-	int			last_non_dup;
-	int			i;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
@@ -475,20 +474,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, (void *) &cxt);
 
 	/* Now scan the sorted elements and remove duplicates */
-	last_non_dup = 0;
-	for (i = 1; i < nelems; i++)
-	{
-		int32		compare;
-
-		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
-												  cxt.collation,
-												  elems[last_non_dup],
-												  elems[i]));
-		if (compare != 0)
-			elems[++last_non_dup] = elems[i];
-	}
-
-	return last_non_dup + 1;
+	return qunique_arg(elems, nelems, sizeof(Datum),
+					   _bt_compare_array_elements, &cxt);
 }
 
 /*
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
index 8cf22d5bf0..a59480f906 100644
--- a/src/backend/executor/nodeTidscan.c
+++ b/src/backend/executor/nodeTidscan.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_type.h"
 #include "executor/execdebug.h"
 #include "executor/nodeTidscan.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "storage/bufmgr.h"
@@ -260,21 +261,13 @@ TidListEval(TidScanState *tidstate)
 	 */
 	if (numTids > 1)
 	{
-		int			lastTid;
-		int			i;
-
 		/* CurrentOfExpr could never appear OR'd with something else */
 		Assert(!tidstate->tss_isCurrentOf);
 
 		qsort((void *) tidList, numTids, sizeof(ItemPointerData),
 			  itemptr_comparator);
-		lastTid = 0;
-		for (i = 1; i < numTids; i++)
-		{
-			if (!ItemPointerEquals(&tidList[lastTid], &tidList[i]))
-				tidList[++lastTid] = tidList[i];
-		}
-		numTids = lastTid + 1;
+		numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
+						  itemptr_comparator);
 	}
 
 	tidstate->tss_TidList = tidList;
diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c
index d7e6100ccb..79bc750d77 100644
--- a/src/backend/utils/adt/acl.c
+++ b/src/backend/utils/adt/acl.c
@@ -28,6 +28,7 @@
 #include "commands/tablespace.h"
 #include "foreign/foreign.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/acl.h"
 #include "utils/array.h"
@@ -1475,8 +1476,7 @@ aclmembers(const Acl *acl, Oid **roleids)
 	Oid		   *list;
 	const AclItem *acldat;
 	int			i,
-				j,
-				k;
+				j;
 
 	if (acl == NULL || ACL_NUM(acl) == 0)
 	{
@@ -1508,21 +1508,14 @@ aclmembers(const Acl *acl, Oid **roleids)
 	/* Sort the array */
 	qsort(list, j, sizeof(Oid), oid_cmp);
 
-	/* Remove duplicates from the array */
-	k = 0;
-	for (i = 1; i < j; i++)
-	{
-		if (list[k] != list[i])
-			list[++k] = list[i];
-	}
-
 	/*
 	 * We could repalloc the array down to minimum size, but it's hardly worth
 	 * it since it's only transient memory.
 	 */
 	*roleids = list;
 
-	return k + 1;
+	/* Remove duplicates from the array */
+	return qunique(list, j, sizeof(Oid), oid_cmp);
 }
 
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 4f256260fd..44defa9de1 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/tuptoaster.h"
+#include "lib/qunique.h"
 #include "port/pg_bitutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
@@ -122,31 +123,6 @@ compareint(const void *va, const void *vb)
 	return (a > b) ? 1 : -1;
 }
 
-/*
- * Removes duplicates from an array of int32. 'l' is
- * size of the input array. Returns the new size of the array.
- */
-static int
-uniqueint(int32 *a, int32 l)
-{
-	int32	   *ptr,
-			   *res;
-
-	if (l <= 1)
-		return l;
-
-	ptr = res = a;
-
-	qsort((void *) a, l, sizeof(int32), compareint);
-
-	while (ptr - a < l)
-		if (*ptr != *res)
-			*(++res) = *ptr++;
-		else
-			ptr++;
-	return res + 1 - a;
-}
-
 static void
 makesign(BITVECP sign, SignTSVector *a)
 {
@@ -193,7 +169,8 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 			ptr++;
 		}
 
-		len = uniqueint(GETARR(res), val->size);
+		qsort(GETARR(res), val->size, sizeof(int), compareint);
+		len = qunique(GETARR(res), val->size, sizeof(int), compareint);
 		if (len != val->size)
 		{
 			/*
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c
index 1f63d9b6a9..e6816940ad 100644
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "lib/qunique.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 
@@ -302,29 +303,6 @@ cmp_string(const void *a, const void *b)
 	return strcmp(sa, sb);
 }
 
-static int
-remove_duplicates(char **strings, int n)
-{
-	if (n <= 1)
-		return n;
-	else
-	{
-		int			i;
-		char	   *prev = strings[0];
-		int			new_n = 1;
-
-		for (i = 1; i < n; i++)
-		{
-			if (strcmp(strings[i], prev) != 0)
-			{
-				strings[new_n++] = strings[i];
-				prev = strings[i];
-			}
-		}
-		return new_n;
-	}
-}
-
 Datum
 tsq_mcontains(PG_FUNCTION_ARGS)
 {
@@ -342,9 +320,10 @@ tsq_mcontains(PG_FUNCTION_ARGS)
 
 	/* Sort and remove duplicates from both arrays */
 	qsort(query_values, query_nvalues, sizeof(char *), cmp_string);
-	query_nvalues = remove_duplicates(query_values, query_nvalues);
+	query_nvalues = qunique(query_values, query_nvalues, sizeof(char *),
+							cmp_string);
 	qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string);
-	ex_nvalues = remove_duplicates(ex_values, ex_nvalues);
+	ex_nvalues = qunique(ex_values, ex_nvalues, sizeof(char *), cmp_string);
 
 	if (ex_nvalues > query_nvalues)
 		result = false;
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index ccfc4147fa..098eaed3e5 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -41,8 +41,9 @@ compareWordEntryPos(const void *a, const void *b)
 }
 
 /*
- * Removes duplicate pos entries. If there's two entries with same pos
- * but different weight, the higher weight is retained.
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
  *
  * Returns new length.
  */
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 28d042273e..f483c4b228 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -21,6 +21,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/parse_coerce.h"
@@ -475,16 +476,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 */
 	if (indices_count > 1)
 	{
-		int			kp;
-
 		qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
-		kp = 0;
-		for (k = 1; k < indices_count; k++)
-		{
-			if (indices_to_delete[k] != indices_to_delete[kp])
-				indices_to_delete[++kp] = indices_to_delete[k];
-		}
-		indices_count = ++kp;
+		indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
+								compare_int);
 	}
 
 	/*
@@ -761,7 +755,6 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	bool	   *nulls;
 	int			nitems,
 				i,
-				j,
 				tslen,
 				datalen = 0;
 	char	   *cur;
@@ -781,13 +774,8 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	if (nitems > 1)
 	{
 		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
-		j = 0;
-		for (i = 1; i < nitems; i++)
-		{
-			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
-				dlexemes[++j] = dlexemes[i];
-		}
-		nitems = ++j;
+		nitems = qunique(dlexemes, nitems, sizeof(Datum),
+						 compare_text_lexemes);
 	}
 
 	/* Calculate space needed for surviving lexemes. */
@@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 	return result;
 }
 
-/*
- * Removes duplicate pos entries. We can't use uniquePos() from
- * tsvector.c because array might be longer than MAXENTRYPOS
- *
- * Returns new length.
- */
-static int
-uniqueLongPos(WordEntryPos *pos, int npos)
-{
-	WordEntryPos *pos_iter,
-			   *result;
-
-	if (npos <= 1)
-		return npos;
-
-	qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
-
-	result = pos;
-	pos_iter = pos + 1;
-	while (pos_iter < pos + npos)
-	{
-		if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
-		{
-			result++;
-			*result = WEP_GETPOS(*pos_iter);
-		}
-
-		pos_iter++;
-	}
-
-	return result + 1 - pos;
-}
-
 /*
  * is there value 'val' in array or not ?
  */
@@ -1397,7 +1352,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
-			data->npos = uniqueLongPos(allpos, npos);
+			qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+			data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
+								 compareWordEntryPos);
 			data->allocated = true;
 		}
 	}
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 16297a52a1..36aee74ab0 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -74,6 +74,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#include "lib/qunique.h"
 #include "utils/rel.h"
 #include "utils/catcache.h"
 #include "utils/syscache.h"
@@ -1010,8 +1011,6 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
-	int			i,
-				j;
 
 	StaticAssertStmt(SysCacheSize == (int) lengthof(cacheinfo),
 					 "SysCacheSize does not match syscache.c's array");
@@ -1048,21 +1047,15 @@ InitCatalogCache(void)
 	/* Sort and de-dup OID arrays, so we can use binary search. */
 	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheRelationOidSize; i++)
-	{
-		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
-			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
-	}
-	SysCacheRelationOidSize = j + 1;
+	SysCacheRelationOidSize =
+		qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
+				oid_compare);
 
 	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++)
-	{
-		if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j])
-			SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i];
-	}
-	SysCacheSupportingRelOidSize = j + 1;
+	SysCacheSupportingRelOidSize =
+		qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
+				sizeof(Oid), oid_compare);
 
 	CacheInitialized = true;
 }
diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h
new file mode 100644
index 0000000000..4d620f82ee
--- /dev/null
+++ b/src/include/lib/qunique.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * qunique.h
+ *		inline array unique functions
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/qunique.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef QUNIQUE_H
+#define QUNIQUE_H
+
+/*
+ * Remove duplicates from a pre-sorted array, according to a user-supplied
+ * comparator.  Usually the array should have been sorted with qsort() using
+ * the same arguments.  Return the new size.
+ */
+static inline size_t
+qunique(void *array, size_t elements, size_t width,
+		int (*compare) (const void *, const void *))
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+/*
+ * Like qunique(), but takes a comparator with an extra user data argument
+ * which is passed through, for compatibility with qsort_arg().
+ */
+static inline size_t
+qunique_arg(void *array, size_t elements, size_t width,
+			int (*compare) (const void *, const void *, void *),
+			void *arg)
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+#endif							/* QUNIQUE_H */
-- 
2.22.0

#4Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#3)
1 attachment(s)
Re: Consolidate 'unique array values' logic into a reusable function?

On Fri, Aug 30, 2019 at 3:34 PM Thomas Munro <thomas.munro@gmail.com> wrote:

Adding to CF.

Rebased due to bitrot. Spotted one more place to use this, in
src/backend/utils/adt/txid.c.

--
Thomas Munro
https://enterprisedb.com

Attachments:

0001-Consolidate-code-that-makes-a-sorted-array-unique-v2.patchapplication/octet-stream; name=0001-Consolidate-code-that-makes-a-sorted-array-unique-v2.patchDownload
From 983b525399c0ed0e109d6847c43f38783d062b62 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Aug 2019 13:41:04 +1200
Subject: [PATCH] Consolidate code that makes a sorted array unique.

Introduce qunique() and qunique_arg() which can be used after qsort()
and qsort_arg() respectively to remove duplicate values.

Author: Thomas Munro
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/CAEepm%3D2vmFTNpAmwbGGD2WaryM6T3hSDVKQPfUwjdD_5XY6vAA%40mail.gmail.com
---
 contrib/hstore/hstore_io.c           |  5 +++
 contrib/intarray/_int_tool.c         | 19 +++-----
 contrib/pg_trgm/trgm_op.c            | 25 ++---------
 src/backend/access/nbtree/nbtutils.c | 19 ++------
 src/backend/executor/nodeTidscan.c   | 13 ++----
 src/backend/utils/adt/acl.c          | 15 ++-----
 src/backend/utils/adt/tsgistidx.c    | 29 ++-----------
 src/backend/utils/adt/tsquery_op.c   | 29 ++-----------
 src/backend/utils/adt/tsvector.c     |  5 ++-
 src/backend/utils/adt/tsvector_op.c  | 59 ++++---------------------
 src/backend/utils/adt/txid.c         | 19 +-------
 src/backend/utils/cache/syscache.c   | 21 +++------
 src/include/lib/qunique.h            | 65 ++++++++++++++++++++++++++++
 13 files changed, 115 insertions(+), 208 deletions(-)
 create mode 100644 src/include/lib/qunique.h

diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index 745497c76f..86d7c7c2d8 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -323,6 +323,11 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 	}
 
 	qsort((void *) a, l, sizeof(Pairs), comparePairs);
+
+	/*
+	 * We can't use qunique here because we have some clean-up code to run on
+	 * removed elements.
+	 */
 	ptr = a + 1;
 	res = a;
 	while (ptr - a < l)
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index fde8d15e2c..27d9ce7297 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -6,6 +6,7 @@
 #include <limits.h>
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 
 #include "_int.h"
 
@@ -310,23 +311,13 @@ internal_size(int *a, int len)
 ArrayType *
 _int_unique(ArrayType *r)
 {
-	int		   *tmp,
-			   *dr,
-			   *data;
 	int			num = ARRNELEMS(r);
+	bool		duplicates_found;	/* not used */
 
-	if (num < 2)
-		return r;
+	num = qunique_arg(ARRPTR(r), num, sizeof(int), isort_cmp,
+					  &duplicates_found);
 
-	data = tmp = dr = ARRPTR(r);
-	while (tmp - data < num)
-	{
-		if (*tmp != *dr)
-			*(++dr) = *tmp++;
-		else
-			tmp++;
-	}
-	return resize_intArrayType(r, dr + 1 - ARRPTR(r));
+	return resize_intArrayType(r, num);
 }
 
 void
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 0d4614e9c8..42fb625e7f 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -8,6 +8,7 @@
 #include "trgm.h"
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 #include "tsearch/ts_locale.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
@@ -163,26 +164,6 @@ comp_trgm(const void *a, const void *b)
 	return CMPTRGM(a, b);
 }
 
-static int
-unique_array(trgm *a, int len)
-{
-	trgm	   *curend,
-			   *tmp;
-
-	curend = tmp = a;
-	while (tmp - a < len)
-		if (CMPTRGM(tmp, curend))
-		{
-			curend++;
-			CPTRGM(curend, tmp);
-			tmp++;
-		}
-		else
-			tmp++;
-
-	return curend + 1 - a;
-}
-
 /*
  * Finds first word in string, returns pointer to the word,
  * endword points to the character after word
@@ -395,7 +376,7 @@ generate_trgm(char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
@@ -943,7 +924,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 4c7b2d0966..662bab97fc 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -21,6 +21,7 @@
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "commands/progress.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/array.h"
 #include "utils/datum.h"
@@ -435,8 +436,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 	Oid			elemtype;
 	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
-	int			last_non_dup;
-	int			i;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
@@ -475,20 +474,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, (void *) &cxt);
 
 	/* Now scan the sorted elements and remove duplicates */
-	last_non_dup = 0;
-	for (i = 1; i < nelems; i++)
-	{
-		int32		compare;
-
-		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
-												  cxt.collation,
-												  elems[last_non_dup],
-												  elems[i]));
-		if (compare != 0)
-			elems[++last_non_dup] = elems[i];
-	}
-
-	return last_non_dup + 1;
+	return qunique_arg(elems, nelems, sizeof(Datum),
+					   _bt_compare_array_elements, &cxt);
 }
 
 /*
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
index 8cf22d5bf0..a59480f906 100644
--- a/src/backend/executor/nodeTidscan.c
+++ b/src/backend/executor/nodeTidscan.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_type.h"
 #include "executor/execdebug.h"
 #include "executor/nodeTidscan.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "storage/bufmgr.h"
@@ -260,21 +261,13 @@ TidListEval(TidScanState *tidstate)
 	 */
 	if (numTids > 1)
 	{
-		int			lastTid;
-		int			i;
-
 		/* CurrentOfExpr could never appear OR'd with something else */
 		Assert(!tidstate->tss_isCurrentOf);
 
 		qsort((void *) tidList, numTids, sizeof(ItemPointerData),
 			  itemptr_comparator);
-		lastTid = 0;
-		for (i = 1; i < numTids; i++)
-		{
-			if (!ItemPointerEquals(&tidList[lastTid], &tidList[i]))
-				tidList[++lastTid] = tidList[i];
-		}
-		numTids = lastTid + 1;
+		numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
+						  itemptr_comparator);
 	}
 
 	tidstate->tss_TidList = tidList;
diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c
index d7e6100ccb..79bc750d77 100644
--- a/src/backend/utils/adt/acl.c
+++ b/src/backend/utils/adt/acl.c
@@ -28,6 +28,7 @@
 #include "commands/tablespace.h"
 #include "foreign/foreign.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/acl.h"
 #include "utils/array.h"
@@ -1475,8 +1476,7 @@ aclmembers(const Acl *acl, Oid **roleids)
 	Oid		   *list;
 	const AclItem *acldat;
 	int			i,
-				j,
-				k;
+				j;
 
 	if (acl == NULL || ACL_NUM(acl) == 0)
 	{
@@ -1508,21 +1508,14 @@ aclmembers(const Acl *acl, Oid **roleids)
 	/* Sort the array */
 	qsort(list, j, sizeof(Oid), oid_cmp);
 
-	/* Remove duplicates from the array */
-	k = 0;
-	for (i = 1; i < j; i++)
-	{
-		if (list[k] != list[i])
-			list[++k] = list[i];
-	}
-
 	/*
 	 * We could repalloc the array down to minimum size, but it's hardly worth
 	 * it since it's only transient memory.
 	 */
 	*roleids = list;
 
-	return k + 1;
+	/* Remove duplicates from the array */
+	return qunique(list, j, sizeof(Oid), oid_cmp);
 }
 
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 6ff71a49b8..af5b6809c5 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/heaptoast.h"
+#include "lib/qunique.h"
 #include "port/pg_bitutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
@@ -122,31 +123,6 @@ compareint(const void *va, const void *vb)
 	return (a > b) ? 1 : -1;
 }
 
-/*
- * Removes duplicates from an array of int32. 'l' is
- * size of the input array. Returns the new size of the array.
- */
-static int
-uniqueint(int32 *a, int32 l)
-{
-	int32	   *ptr,
-			   *res;
-
-	if (l <= 1)
-		return l;
-
-	ptr = res = a;
-
-	qsort((void *) a, l, sizeof(int32), compareint);
-
-	while (ptr - a < l)
-		if (*ptr != *res)
-			*(++res) = *ptr++;
-		else
-			ptr++;
-	return res + 1 - a;
-}
-
 static void
 makesign(BITVECP sign, SignTSVector *a)
 {
@@ -193,7 +169,8 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 			ptr++;
 		}
 
-		len = uniqueint(GETARR(res), val->size);
+		qsort(GETARR(res), val->size, sizeof(int), compareint);
+		len = qunique(GETARR(res), val->size, sizeof(int), compareint);
 		if (len != val->size)
 		{
 			/*
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c
index 1f63d9b6a9..e6816940ad 100644
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "lib/qunique.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 
@@ -302,29 +303,6 @@ cmp_string(const void *a, const void *b)
 	return strcmp(sa, sb);
 }
 
-static int
-remove_duplicates(char **strings, int n)
-{
-	if (n <= 1)
-		return n;
-	else
-	{
-		int			i;
-		char	   *prev = strings[0];
-		int			new_n = 1;
-
-		for (i = 1; i < n; i++)
-		{
-			if (strcmp(strings[i], prev) != 0)
-			{
-				strings[new_n++] = strings[i];
-				prev = strings[i];
-			}
-		}
-		return new_n;
-	}
-}
-
 Datum
 tsq_mcontains(PG_FUNCTION_ARGS)
 {
@@ -342,9 +320,10 @@ tsq_mcontains(PG_FUNCTION_ARGS)
 
 	/* Sort and remove duplicates from both arrays */
 	qsort(query_values, query_nvalues, sizeof(char *), cmp_string);
-	query_nvalues = remove_duplicates(query_values, query_nvalues);
+	query_nvalues = qunique(query_values, query_nvalues, sizeof(char *),
+							cmp_string);
 	qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string);
-	ex_nvalues = remove_duplicates(ex_values, ex_nvalues);
+	ex_nvalues = qunique(ex_values, ex_nvalues, sizeof(char *), cmp_string);
 
 	if (ex_nvalues > query_nvalues)
 		result = false;
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index ccfc4147fa..098eaed3e5 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -41,8 +41,9 @@ compareWordEntryPos(const void *a, const void *b)
 }
 
 /*
- * Removes duplicate pos entries. If there's two entries with same pos
- * but different weight, the higher weight is retained.
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
  *
  * Returns new length.
  */
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 28d042273e..f483c4b228 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -21,6 +21,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/parse_coerce.h"
@@ -475,16 +476,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 */
 	if (indices_count > 1)
 	{
-		int			kp;
-
 		qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
-		kp = 0;
-		for (k = 1; k < indices_count; k++)
-		{
-			if (indices_to_delete[k] != indices_to_delete[kp])
-				indices_to_delete[++kp] = indices_to_delete[k];
-		}
-		indices_count = ++kp;
+		indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
+								compare_int);
 	}
 
 	/*
@@ -761,7 +755,6 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	bool	   *nulls;
 	int			nitems,
 				i,
-				j,
 				tslen,
 				datalen = 0;
 	char	   *cur;
@@ -781,13 +774,8 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	if (nitems > 1)
 	{
 		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
-		j = 0;
-		for (i = 1; i < nitems; i++)
-		{
-			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
-				dlexemes[++j] = dlexemes[i];
-		}
-		nitems = ++j;
+		nitems = qunique(dlexemes, nitems, sizeof(Datum),
+						 compare_text_lexemes);
 	}
 
 	/* Calculate space needed for surviving lexemes. */
@@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 	return result;
 }
 
-/*
- * Removes duplicate pos entries. We can't use uniquePos() from
- * tsvector.c because array might be longer than MAXENTRYPOS
- *
- * Returns new length.
- */
-static int
-uniqueLongPos(WordEntryPos *pos, int npos)
-{
-	WordEntryPos *pos_iter,
-			   *result;
-
-	if (npos <= 1)
-		return npos;
-
-	qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
-
-	result = pos;
-	pos_iter = pos + 1;
-	while (pos_iter < pos + npos)
-	{
-		if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
-		{
-			result++;
-			*result = WEP_GETPOS(*pos_iter);
-		}
-
-		pos_iter++;
-	}
-
-	return result + 1 - pos;
-}
-
 /*
  * is there value 'val' in array or not ?
  */
@@ -1397,7 +1352,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
-			data->npos = uniqueLongPos(allpos, npos);
+			qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+			data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
+								 compareWordEntryPos);
 			data->allocated = true;
 		}
 	}
diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c
index 90b2c9b694..e220c5f136 100644
--- a/src/backend/utils/adt/txid.c
+++ b/src/backend/utils/adt/txid.c
@@ -27,6 +27,7 @@
 #include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "lib/qunique.h"
 #include "libpq/pqformat.h"
 #include "postmaster/postmaster.h"
 #include "storage/lwlock.h"
@@ -213,26 +214,10 @@ cmp_txid(const void *aa, const void *bb)
 static void
 sort_snapshot(TxidSnapshot *snap)
 {
-	txid		last = 0;
-	int			nxip,
-				idx1,
-				idx2;
-
 	if (snap->nxip > 1)
 	{
 		qsort(snap->xip, snap->nxip, sizeof(txid), cmp_txid);
-
-		/* remove duplicates */
-		nxip = snap->nxip;
-		idx1 = idx2 = 0;
-		while (idx1 < nxip)
-		{
-			if (snap->xip[idx1] != last)
-				last = snap->xip[idx2++] = snap->xip[idx1];
-			else
-				snap->nxip--;
-			idx1++;
-		}
+		snap->nxip = qunique(snap->xip, snap->nxip, sizeof(txid), cmp_txid);
 	}
 }
 
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 16297a52a1..36aee74ab0 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -74,6 +74,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#include "lib/qunique.h"
 #include "utils/rel.h"
 #include "utils/catcache.h"
 #include "utils/syscache.h"
@@ -1010,8 +1011,6 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
-	int			i,
-				j;
 
 	StaticAssertStmt(SysCacheSize == (int) lengthof(cacheinfo),
 					 "SysCacheSize does not match syscache.c's array");
@@ -1048,21 +1047,15 @@ InitCatalogCache(void)
 	/* Sort and de-dup OID arrays, so we can use binary search. */
 	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheRelationOidSize; i++)
-	{
-		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
-			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
-	}
-	SysCacheRelationOidSize = j + 1;
+	SysCacheRelationOidSize =
+		qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
+				oid_compare);
 
 	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++)
-	{
-		if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j])
-			SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i];
-	}
-	SysCacheSupportingRelOidSize = j + 1;
+	SysCacheSupportingRelOidSize =
+		qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
+				sizeof(Oid), oid_compare);
 
 	CacheInitialized = true;
 }
diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h
new file mode 100644
index 0000000000..4d620f82ee
--- /dev/null
+++ b/src/include/lib/qunique.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * qunique.h
+ *		inline array unique functions
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/qunique.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef QUNIQUE_H
+#define QUNIQUE_H
+
+/*
+ * Remove duplicates from a pre-sorted array, according to a user-supplied
+ * comparator.  Usually the array should have been sorted with qsort() using
+ * the same arguments.  Return the new size.
+ */
+static inline size_t
+qunique(void *array, size_t elements, size_t width,
+		int (*compare) (const void *, const void *))
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+/*
+ * Like qunique(), but takes a comparator with an extra user data argument
+ * which is passed through, for compatibility with qsort_arg().
+ */
+static inline size_t
+qunique_arg(void *array, size_t elements, size_t width,
+			int (*compare) (const void *, const void *, void *),
+			void *arg)
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+#endif							/* QUNIQUE_H */
-- 
2.22.0

#5Thomas Munro
thomas.munro@gmail.com
In reply to: Thomas Munro (#4)
1 attachment(s)
Re: Consolidate 'unique array values' logic into a reusable function?

On Tue, Sep 10, 2019 at 11:43 AM Thomas Munro <thomas.munro@gmail.com> wrote:

Rebased due to bitrot. Spotted one more place to use this, in
src/backend/utils/adt/txid.c.

Rebased. I'm planning to commit this soon.

Attachments:

0001-Consolidate-code-that-makes-a-sorted-array-unique-v3.patchapplication/octet-stream; name=0001-Consolidate-code-that-makes-a-sorted-array-unique-v3.patchDownload
From c5aedd5bcd917a8c4f987b5dbcdd039a4121e5b6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 30 Aug 2019 13:41:04 +1200
Subject: [PATCH] Consolidate code that makes a sorted array unique.

Introduce qunique() and qunique_arg() which can be used after qsort()
and qsort_arg() respectively to remove duplicate values.

Author: Thomas Munro
Reviewed-by: Tom Lane
Discussion: https://postgr.es/m/CAEepm%3D2vmFTNpAmwbGGD2WaryM6T3hSDVKQPfUwjdD_5XY6vAA%40mail.gmail.com
---
 contrib/hstore/hstore_io.c           |  5 +++
 contrib/intarray/_int_tool.c         | 19 +++-----
 contrib/pg_trgm/trgm_op.c            | 25 ++---------
 src/backend/access/nbtree/nbtutils.c | 19 ++------
 src/backend/executor/nodeTidscan.c   | 13 ++----
 src/backend/utils/adt/acl.c          | 15 ++-----
 src/backend/utils/adt/tsgistidx.c    | 29 ++-----------
 src/backend/utils/adt/tsquery_op.c   | 29 ++-----------
 src/backend/utils/adt/tsvector.c     |  5 ++-
 src/backend/utils/adt/tsvector_op.c  | 59 ++++---------------------
 src/backend/utils/adt/txid.c         | 19 +-------
 src/backend/utils/cache/syscache.c   | 21 +++------
 src/include/lib/qunique.h            | 65 ++++++++++++++++++++++++++++
 13 files changed, 115 insertions(+), 208 deletions(-)
 create mode 100644 src/include/lib/qunique.h

diff --git a/contrib/hstore/hstore_io.c b/contrib/hstore/hstore_io.c
index be3cce4f74..10ec392775 100644
--- a/contrib/hstore/hstore_io.c
+++ b/contrib/hstore/hstore_io.c
@@ -322,6 +322,11 @@ hstoreUniquePairs(Pairs *a, int32 l, int32 *buflen)
 	}
 
 	qsort((void *) a, l, sizeof(Pairs), comparePairs);
+
+	/*
+	 * We can't use qunique here because we have some clean-up code to run on
+	 * removed elements.
+	 */
 	ptr = a + 1;
 	res = a;
 	while (ptr - a < l)
diff --git a/contrib/intarray/_int_tool.c b/contrib/intarray/_int_tool.c
index efff81d77d..e5f4bd4064 100644
--- a/contrib/intarray/_int_tool.c
+++ b/contrib/intarray/_int_tool.c
@@ -7,6 +7,7 @@
 
 #include "_int.h"
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 
 /* arguments are assumed sorted & unique-ified */
 bool
@@ -308,23 +309,13 @@ internal_size(int *a, int len)
 ArrayType *
 _int_unique(ArrayType *r)
 {
-	int		   *tmp,
-			   *dr,
-			   *data;
 	int			num = ARRNELEMS(r);
+	bool		duplicates_found;	/* not used */
 
-	if (num < 2)
-		return r;
+	num = qunique_arg(ARRPTR(r), num, sizeof(int), isort_cmp,
+					  &duplicates_found);
 
-	data = tmp = dr = ARRPTR(r);
-	while (tmp - data < num)
-	{
-		if (*tmp != *dr)
-			*(++dr) = *tmp++;
-		else
-			tmp++;
-	}
-	return resize_intArrayType(r, dr + 1 - ARRPTR(r));
+	return resize_intArrayType(r, num);
 }
 
 void
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c
index 07a3180378..c9c8cbc734 100644
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -6,6 +6,7 @@
 #include <ctype.h>
 
 #include "catalog/pg_type.h"
+#include "lib/qunique.h"
 #include "trgm.h"
 #include "tsearch/ts_locale.h"
 #include "utils/lsyscache.h"
@@ -162,26 +163,6 @@ comp_trgm(const void *a, const void *b)
 	return CMPTRGM(a, b);
 }
 
-static int
-unique_array(trgm *a, int len)
-{
-	trgm	   *curend,
-			   *tmp;
-
-	curend = tmp = a;
-	while (tmp - a < len)
-		if (CMPTRGM(tmp, curend))
-		{
-			curend++;
-			CPTRGM(curend, tmp);
-			tmp++;
-		}
-		else
-			tmp++;
-
-	return curend + 1 - a;
-}
-
 /*
  * Finds first word in string, returns pointer to the word,
  * endword points to the character after word
@@ -394,7 +375,7 @@ generate_trgm(char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
@@ -942,7 +923,7 @@ generate_wildcard_trgm(const char *str, int slen)
 	if (len > 1)
 	{
 		qsort((void *) GETARR(trg), len, sizeof(trgm), comp_trgm);
-		len = unique_array(GETARR(trg), len);
+		len = qunique(GETARR(trg), len, sizeof(trgm), comp_trgm);
 	}
 
 	SET_VARSIZE(trg, CALCGTSIZE(ARRKEY, len));
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index bc855dd25d..6a3008dd48 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -21,6 +21,7 @@
 #include "access/reloptions.h"
 #include "access/relscan.h"
 #include "commands/progress.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/array.h"
 #include "utils/datum.h"
@@ -435,8 +436,6 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 	Oid			elemtype;
 	RegProcedure cmp_proc;
 	BTSortArrayContext cxt;
-	int			last_non_dup;
-	int			i;
 
 	if (nelems <= 1)
 		return nelems;			/* no work to do */
@@ -475,20 +474,8 @@ _bt_sort_array_elements(IndexScanDesc scan, ScanKey skey,
 			  _bt_compare_array_elements, (void *) &cxt);
 
 	/* Now scan the sorted elements and remove duplicates */
-	last_non_dup = 0;
-	for (i = 1; i < nelems; i++)
-	{
-		int32		compare;
-
-		compare = DatumGetInt32(FunctionCall2Coll(&cxt.flinfo,
-												  cxt.collation,
-												  elems[last_non_dup],
-												  elems[i]));
-		if (compare != 0)
-			elems[++last_non_dup] = elems[i];
-	}
-
-	return last_non_dup + 1;
+	return qunique_arg(elems, nelems, sizeof(Datum),
+					   _bt_compare_array_elements, &cxt);
 }
 
 /*
diff --git a/src/backend/executor/nodeTidscan.c b/src/backend/executor/nodeTidscan.c
index 8cf22d5bf0..a59480f906 100644
--- a/src/backend/executor/nodeTidscan.c
+++ b/src/backend/executor/nodeTidscan.c
@@ -27,6 +27,7 @@
 #include "catalog/pg_type.h"
 #include "executor/execdebug.h"
 #include "executor/nodeTidscan.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "nodes/nodeFuncs.h"
 #include "storage/bufmgr.h"
@@ -260,21 +261,13 @@ TidListEval(TidScanState *tidstate)
 	 */
 	if (numTids > 1)
 	{
-		int			lastTid;
-		int			i;
-
 		/* CurrentOfExpr could never appear OR'd with something else */
 		Assert(!tidstate->tss_isCurrentOf);
 
 		qsort((void *) tidList, numTids, sizeof(ItemPointerData),
 			  itemptr_comparator);
-		lastTid = 0;
-		for (i = 1; i < numTids; i++)
-		{
-			if (!ItemPointerEquals(&tidList[lastTid], &tidList[i]))
-				tidList[++lastTid] = tidList[i];
-		}
-		numTids = lastTid + 1;
+		numTids = qunique(tidList, numTids, sizeof(ItemPointerData),
+						  itemptr_comparator);
 	}
 
 	tidstate->tss_TidList = tidList;
diff --git a/src/backend/utils/adt/acl.c b/src/backend/utils/adt/acl.c
index d7e6100ccb..79bc750d77 100644
--- a/src/backend/utils/adt/acl.c
+++ b/src/backend/utils/adt/acl.c
@@ -28,6 +28,7 @@
 #include "commands/tablespace.h"
 #include "foreign/foreign.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "miscadmin.h"
 #include "utils/acl.h"
 #include "utils/array.h"
@@ -1475,8 +1476,7 @@ aclmembers(const Acl *acl, Oid **roleids)
 	Oid		   *list;
 	const AclItem *acldat;
 	int			i,
-				j,
-				k;
+				j;
 
 	if (acl == NULL || ACL_NUM(acl) == 0)
 	{
@@ -1508,21 +1508,14 @@ aclmembers(const Acl *acl, Oid **roleids)
 	/* Sort the array */
 	qsort(list, j, sizeof(Oid), oid_cmp);
 
-	/* Remove duplicates from the array */
-	k = 0;
-	for (i = 1; i < j; i++)
-	{
-		if (list[k] != list[i])
-			list[++k] = list[i];
-	}
-
 	/*
 	 * We could repalloc the array down to minimum size, but it's hardly worth
 	 * it since it's only transient memory.
 	 */
 	*roleids = list;
 
-	return k + 1;
+	/* Remove duplicates from the array */
+	return qunique(list, j, sizeof(Oid), oid_cmp);
 }
 
 
diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index 6ff71a49b8..af5b6809c5 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -16,6 +16,7 @@
 
 #include "access/gist.h"
 #include "access/heaptoast.h"
+#include "lib/qunique.h"
 #include "port/pg_bitutils.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
@@ -122,31 +123,6 @@ compareint(const void *va, const void *vb)
 	return (a > b) ? 1 : -1;
 }
 
-/*
- * Removes duplicates from an array of int32. 'l' is
- * size of the input array. Returns the new size of the array.
- */
-static int
-uniqueint(int32 *a, int32 l)
-{
-	int32	   *ptr,
-			   *res;
-
-	if (l <= 1)
-		return l;
-
-	ptr = res = a;
-
-	qsort((void *) a, l, sizeof(int32), compareint);
-
-	while (ptr - a < l)
-		if (*ptr != *res)
-			*(++res) = *ptr++;
-		else
-			ptr++;
-	return res + 1 - a;
-}
-
 static void
 makesign(BITVECP sign, SignTSVector *a)
 {
@@ -193,7 +169,8 @@ gtsvector_compress(PG_FUNCTION_ARGS)
 			ptr++;
 		}
 
-		len = uniqueint(GETARR(res), val->size);
+		qsort(GETARR(res), val->size, sizeof(int), compareint);
+		len = qunique(GETARR(res), val->size, sizeof(int), compareint);
 		if (len != val->size)
 		{
 			/*
diff --git a/src/backend/utils/adt/tsquery_op.c b/src/backend/utils/adt/tsquery_op.c
index 1f63d9b6a9..e6816940ad 100644
--- a/src/backend/utils/adt/tsquery_op.c
+++ b/src/backend/utils/adt/tsquery_op.c
@@ -14,6 +14,7 @@
 
 #include "postgres.h"
 
+#include "lib/qunique.h"
 #include "tsearch/ts_utils.h"
 #include "utils/builtins.h"
 
@@ -302,29 +303,6 @@ cmp_string(const void *a, const void *b)
 	return strcmp(sa, sb);
 }
 
-static int
-remove_duplicates(char **strings, int n)
-{
-	if (n <= 1)
-		return n;
-	else
-	{
-		int			i;
-		char	   *prev = strings[0];
-		int			new_n = 1;
-
-		for (i = 1; i < n; i++)
-		{
-			if (strcmp(strings[i], prev) != 0)
-			{
-				strings[new_n++] = strings[i];
-				prev = strings[i];
-			}
-		}
-		return new_n;
-	}
-}
-
 Datum
 tsq_mcontains(PG_FUNCTION_ARGS)
 {
@@ -342,9 +320,10 @@ tsq_mcontains(PG_FUNCTION_ARGS)
 
 	/* Sort and remove duplicates from both arrays */
 	qsort(query_values, query_nvalues, sizeof(char *), cmp_string);
-	query_nvalues = remove_duplicates(query_values, query_nvalues);
+	query_nvalues = qunique(query_values, query_nvalues, sizeof(char *),
+							cmp_string);
 	qsort(ex_values, ex_nvalues, sizeof(char *), cmp_string);
-	ex_nvalues = remove_duplicates(ex_values, ex_nvalues);
+	ex_nvalues = qunique(ex_values, ex_nvalues, sizeof(char *), cmp_string);
 
 	if (ex_nvalues > query_nvalues)
 		result = false;
diff --git a/src/backend/utils/adt/tsvector.c b/src/backend/utils/adt/tsvector.c
index ccfc4147fa..098eaed3e5 100644
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@@ -41,8 +41,9 @@ compareWordEntryPos(const void *a, const void *b)
 }
 
 /*
- * Removes duplicate pos entries. If there's two entries with same pos
- * but different weight, the higher weight is retained.
+ * Removes duplicate pos entries. If there's two entries with same pos but
+ * different weight, the higher weight is retained, so we can't use
+ * qunique here.
  *
  * Returns new length.
  */
diff --git a/src/backend/utils/adt/tsvector_op.c b/src/backend/utils/adt/tsvector_op.c
index 28d042273e..f483c4b228 100644
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@@ -21,6 +21,7 @@
 #include "commands/trigger.h"
 #include "executor/spi.h"
 #include "funcapi.h"
+#include "lib/qunique.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "parser/parse_coerce.h"
@@ -475,16 +476,9 @@ tsvector_delete_by_indices(TSVector tsv, int *indices_to_delete,
 	 */
 	if (indices_count > 1)
 	{
-		int			kp;
-
 		qsort(indices_to_delete, indices_count, sizeof(int), compare_int);
-		kp = 0;
-		for (k = 1; k < indices_count; k++)
-		{
-			if (indices_to_delete[k] != indices_to_delete[kp])
-				indices_to_delete[++kp] = indices_to_delete[k];
-		}
-		indices_count = ++kp;
+		indices_count = qunique(indices_to_delete, indices_count, sizeof(int),
+								compare_int);
 	}
 
 	/*
@@ -761,7 +755,6 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	bool	   *nulls;
 	int			nitems,
 				i,
-				j,
 				tslen,
 				datalen = 0;
 	char	   *cur;
@@ -781,13 +774,8 @@ array_to_tsvector(PG_FUNCTION_ARGS)
 	if (nitems > 1)
 	{
 		qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes);
-		j = 0;
-		for (i = 1; i < nitems; i++)
-		{
-			if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0)
-				dlexemes[++j] = dlexemes[i];
-		}
-		nitems = ++j;
+		nitems = qunique(dlexemes, nitems, sizeof(Datum),
+						 compare_text_lexemes);
 	}
 
 	/* Calculate space needed for surviving lexemes. */
@@ -1270,39 +1258,6 @@ checkclass_str(CHKVAL *chkval, WordEntry *entry, QueryOperand *val,
 	return result;
 }
 
-/*
- * Removes duplicate pos entries. We can't use uniquePos() from
- * tsvector.c because array might be longer than MAXENTRYPOS
- *
- * Returns new length.
- */
-static int
-uniqueLongPos(WordEntryPos *pos, int npos)
-{
-	WordEntryPos *pos_iter,
-			   *result;
-
-	if (npos <= 1)
-		return npos;
-
-	qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
-
-	result = pos;
-	pos_iter = pos + 1;
-	while (pos_iter < pos + npos)
-	{
-		if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result))
-		{
-			result++;
-			*result = WEP_GETPOS(*pos_iter);
-		}
-
-		pos_iter++;
-	}
-
-	return result + 1 - pos;
-}
-
 /*
  * is there value 'val' in array or not ?
  */
@@ -1397,7 +1352,9 @@ checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data)
 		{
 			/* Sort and make unique array of found positions */
 			data->pos = allpos;
-			data->npos = uniqueLongPos(allpos, npos);
+			qsort(data->pos, npos, sizeof(WordEntryPos), compareWordEntryPos);
+			data->npos = qunique(data->pos, npos, sizeof(WordEntryPos),
+								 compareWordEntryPos);
 			data->allocated = true;
 		}
 	}
diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c
index 90b2c9b694..e220c5f136 100644
--- a/src/backend/utils/adt/txid.c
+++ b/src/backend/utils/adt/txid.c
@@ -27,6 +27,7 @@
 #include "access/xlog.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "lib/qunique.h"
 #include "libpq/pqformat.h"
 #include "postmaster/postmaster.h"
 #include "storage/lwlock.h"
@@ -213,26 +214,10 @@ cmp_txid(const void *aa, const void *bb)
 static void
 sort_snapshot(TxidSnapshot *snap)
 {
-	txid		last = 0;
-	int			nxip,
-				idx1,
-				idx2;
-
 	if (snap->nxip > 1)
 	{
 		qsort(snap->xip, snap->nxip, sizeof(txid), cmp_txid);
-
-		/* remove duplicates */
-		nxip = snap->nxip;
-		idx1 = idx2 = 0;
-		while (idx1 < nxip)
-		{
-			if (snap->xip[idx1] != last)
-				last = snap->xip[idx2++] = snap->xip[idx1];
-			else
-				snap->nxip--;
-			idx1++;
-		}
+		snap->nxip = qunique(snap->xip, snap->nxip, sizeof(txid), cmp_txid);
 	}
 }
 
diff --git a/src/backend/utils/cache/syscache.c b/src/backend/utils/cache/syscache.c
index 16297a52a1..36aee74ab0 100644
--- a/src/backend/utils/cache/syscache.c
+++ b/src/backend/utils/cache/syscache.c
@@ -74,6 +74,7 @@
 #include "catalog/pg_ts_template.h"
 #include "catalog/pg_type.h"
 #include "catalog/pg_user_mapping.h"
+#include "lib/qunique.h"
 #include "utils/rel.h"
 #include "utils/catcache.h"
 #include "utils/syscache.h"
@@ -1010,8 +1011,6 @@ void
 InitCatalogCache(void)
 {
 	int			cacheId;
-	int			i,
-				j;
 
 	StaticAssertStmt(SysCacheSize == (int) lengthof(cacheinfo),
 					 "SysCacheSize does not match syscache.c's array");
@@ -1048,21 +1047,15 @@ InitCatalogCache(void)
 	/* Sort and de-dup OID arrays, so we can use binary search. */
 	pg_qsort(SysCacheRelationOid, SysCacheRelationOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheRelationOidSize; i++)
-	{
-		if (SysCacheRelationOid[i] != SysCacheRelationOid[j])
-			SysCacheRelationOid[++j] = SysCacheRelationOid[i];
-	}
-	SysCacheRelationOidSize = j + 1;
+	SysCacheRelationOidSize =
+		qunique(SysCacheRelationOid, SysCacheRelationOidSize, sizeof(Oid),
+				oid_compare);
 
 	pg_qsort(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
 			 sizeof(Oid), oid_compare);
-	for (i = 1, j = 0; i < SysCacheSupportingRelOidSize; i++)
-	{
-		if (SysCacheSupportingRelOid[i] != SysCacheSupportingRelOid[j])
-			SysCacheSupportingRelOid[++j] = SysCacheSupportingRelOid[i];
-	}
-	SysCacheSupportingRelOidSize = j + 1;
+	SysCacheSupportingRelOidSize =
+		qunique(SysCacheSupportingRelOid, SysCacheSupportingRelOidSize,
+				sizeof(Oid), oid_compare);
 
 	CacheInitialized = true;
 }
diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h
new file mode 100644
index 0000000000..4d620f82ee
--- /dev/null
+++ b/src/include/lib/qunique.h
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * qunique.h
+ *		inline array unique functions
+ * Portions Copyright (c) 2019, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *		src/include/lib/qunique.h
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef QUNIQUE_H
+#define QUNIQUE_H
+
+/*
+ * Remove duplicates from a pre-sorted array, according to a user-supplied
+ * comparator.  Usually the array should have been sorted with qsort() using
+ * the same arguments.  Return the new size.
+ */
+static inline size_t
+qunique(void *array, size_t elements, size_t width,
+		int (*compare) (const void *, const void *))
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+/*
+ * Like qunique(), but takes a comparator with an extra user data argument
+ * which is passed through, for compatibility with qsort_arg().
+ */
+static inline size_t
+qunique_arg(void *array, size_t elements, size_t width,
+			int (*compare) (const void *, const void *, void *),
+			void *arg)
+{
+	char	   *bytes = (char *) array;
+	size_t		i,
+				j;
+
+	if (elements <= 1)
+		return elements;
+
+	for (i = 1, j = 0; i < elements; ++i)
+	{
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
+			memcpy(bytes + ++j * width, bytes + i * width, width);
+	}
+
+	return j + 1;
+}
+
+#endif							/* QUNIQUE_H */
-- 
2.23.0

#6Noah Misch
noah@leadboat.com
In reply to: Thomas Munro (#5)
1 attachment(s)
Re: Consolidate 'unique array values' logic into a reusable function?

On Mon, Nov 04, 2019 at 12:02:21PM +1300, Thomas Munro wrote:

Rebased. I'm planning to commit this soon.

In each installcheck-parallel run under valgrind-3.14.0, I now see ~1200
reports like this:

==00:00:00:28.322 1527557== Source and destination overlap in memcpy(0x1000104, 0x1000104, 4)
==00:00:00:28.322 1527557== at 0x4C2E74D: memcpy@@GLIBC_2.14 (vg_replace_strmem.c:1035)
==00:00:00:28.322 1527557== by 0xA9A57B: qunique (qunique.h:34)
==00:00:00:28.322 1527557== by 0xA9A843: InitCatalogCache (syscache.c:1056)
==00:00:00:28.322 1527557== by 0xAB6B18: InitPostgres (postinit.c:682)
==00:00:00:28.322 1527557== by 0x91F98E: PostgresMain (postgres.c:3909)
==00:00:00:28.322 1527557== by 0x872DE9: BackendRun (postmaster.c:4498)
==00:00:00:28.322 1527557== by 0x8725B3: BackendStartup (postmaster.c:4189)
==00:00:00:28.322 1527557== by 0x86E7F4: ServerLoop (postmaster.c:1727)
==00:00:00:28.322 1527557== by 0x86E0AA: PostmasterMain (postmaster.c:1400)
==00:00:00:28.322 1527557== by 0x77CB56: main (main.c:210)
==00:00:00:28.322 1527557==
{
<insert_a_suppression_name_here>
Memcheck:Overlap
fun:memcpy@@GLIBC_2.14
fun:qunique
fun:InitCatalogCache
fun:InitPostgres
fun:PostgresMain
fun:BackendRun
fun:BackendStartup
fun:ServerLoop
fun:PostmasterMain
fun:main
}

This is like the problem fixed in 9a9473f; the precedent from there would be
to test src!=dst before calling mempcy(), e.g. as attached. I suppose the
alternative would be to add a suppression like the one 9a9473f removed.

I do wonder why the Valgrind buildfarm animals haven't noticed.

Attachments:

qunique-valgrind-v1.patchtext/x-diff; charset=us-asciiDownload
diff --git a/src/include/lib/qunique.h b/src/include/lib/qunique.h
index 4d620f8..fc539ca 100644
--- a/src/include/lib/qunique.h
+++ b/src/include/lib/qunique.h
@@ -30,8 +30,9 @@ qunique(void *array, size_t elements, size_t width,
 
 	for (i = 1, j = 0; i < elements; ++i)
 	{
-		if (compare(bytes + i * width, bytes + j * width) != 0)
-			memcpy(bytes + ++j * width, bytes + i * width, width);
+		if (compare(bytes + i * width, bytes + j * width) != 0 &&
+			++j != i)
+			memcpy(bytes + j * width, bytes + i * width, width);
 	}
 
 	return j + 1;
@@ -55,8 +56,9 @@ qunique_arg(void *array, size_t elements, size_t width,
 
 	for (i = 1, j = 0; i < elements; ++i)
 	{
-		if (compare(bytes + i * width, bytes + j * width, arg) != 0)
-			memcpy(bytes + ++j * width, bytes + i * width, width);
+		if (compare(bytes + i * width, bytes + j * width, arg) != 0 &&
+			++j != i)
+			memcpy(bytes + j * width, bytes + i * width, width);
 	}
 
 	return j + 1;
#7Thomas Munro
thomas.munro@gmail.com
In reply to: Noah Misch (#6)
Re: Consolidate 'unique array values' logic into a reusable function?

On Sun, Dec 29, 2019 at 8:02 PM Noah Misch <noah@leadboat.com> wrote:

==00:00:00:28.322 1527557== Source and destination overlap in memcpy(0x1000104, 0x1000104, 4)
==00:00:00:28.322 1527557== at 0x4C2E74D: memcpy@@GLIBC_2.14 (vg_replace_strmem.c:1035)
==00:00:00:28.322 1527557== by 0xA9A57B: qunique (qunique.h:34)
==00:00:00:28.322 1527557== by 0xA9A843: InitCatalogCache (syscache.c:1056)
==00:00:00:28.322 1527557== by 0xAB6B18: InitPostgres (postinit.c:682)
==00:00:00:28.322 1527557== by 0x91F98E: PostgresMain (postgres.c:3909)
==00:00:00:28.322 1527557== by 0x872DE9: BackendRun (postmaster.c:4498)
==00:00:00:28.322 1527557== by 0x8725B3: BackendStartup (postmaster.c:4189)
==00:00:00:28.322 1527557== by 0x86E7F4: ServerLoop (postmaster.c:1727)
==00:00:00:28.322 1527557== by 0x86E0AA: PostmasterMain (postmaster.c:1400)
==00:00:00:28.322 1527557== by 0x77CB56: main (main.c:210)
==00:00:00:28.322 1527557==
{
<insert_a_suppression_name_here>
Memcheck:Overlap
fun:memcpy@@GLIBC_2.14
fun:qunique
fun:InitCatalogCache
fun:InitPostgres
fun:PostgresMain
fun:BackendRun
fun:BackendStartup
fun:ServerLoop
fun:PostmasterMain
fun:main
}

This is like the problem fixed in 9a9473f; the precedent from there would be
to test src!=dst before calling mempcy(), e.g. as attached. I suppose the
alternative would be to add a suppression like the one 9a9473f removed.

Thanks for fixing that.

I do wonder why the Valgrind buildfarm animals haven't noticed.

Optimisation levels? For example, I see that skink is using -Og, at
which level my local GCC inlines qunique() and memcpy() so that in the
case you quoted there's a MOV instruction and valgrind has nothing to
complain about.

#8Noah Misch
noah@leadboat.com
In reply to: Thomas Munro (#7)
Re: Consolidate 'unique array values' logic into a reusable function?

On Wed, Jan 08, 2020 at 02:49:48PM +1300, Thomas Munro wrote:

On Sun, Dec 29, 2019 at 8:02 PM Noah Misch <noah@leadboat.com> wrote:

==00:00:00:28.322 1527557== Source and destination overlap in memcpy(0x1000104, 0x1000104, 4)
==00:00:00:28.322 1527557== at 0x4C2E74D: memcpy@@GLIBC_2.14 (vg_replace_strmem.c:1035)
==00:00:00:28.322 1527557== by 0xA9A57B: qunique (qunique.h:34)
==00:00:00:28.322 1527557== by 0xA9A843: InitCatalogCache (syscache.c:1056)
==00:00:00:28.322 1527557== by 0xAB6B18: InitPostgres (postinit.c:682)
==00:00:00:28.322 1527557== by 0x91F98E: PostgresMain (postgres.c:3909)
==00:00:00:28.322 1527557== by 0x872DE9: BackendRun (postmaster.c:4498)
==00:00:00:28.322 1527557== by 0x8725B3: BackendStartup (postmaster.c:4189)
==00:00:00:28.322 1527557== by 0x86E7F4: ServerLoop (postmaster.c:1727)
==00:00:00:28.322 1527557== by 0x86E0AA: PostmasterMain (postmaster.c:1400)
==00:00:00:28.322 1527557== by 0x77CB56: main (main.c:210)
==00:00:00:28.322 1527557==
{
<insert_a_suppression_name_here>
Memcheck:Overlap
fun:memcpy@@GLIBC_2.14
fun:qunique
fun:InitCatalogCache
fun:InitPostgres
fun:PostgresMain
fun:BackendRun
fun:BackendStartup
fun:ServerLoop
fun:PostmasterMain
fun:main
}

This is like the problem fixed in 9a9473f; the precedent from there would be
to test src!=dst before calling mempcy(), e.g. as attached. I suppose the
alternative would be to add a suppression like the one 9a9473f removed.

Thanks for fixing that.

Glad to help.

I do wonder why the Valgrind buildfarm animals haven't noticed.

Optimisation levels? For example, I see that skink is using -Og, at
which level my local GCC inlines qunique() and memcpy() so that in the
case you quoted there's a MOV instruction and valgrind has nothing to
complain about.

That explains it. I would have been using -O0. (It would be a nice bonus to
have both an -O0 Valgrind buildfarm animal and an optimized Valgrind animal,
since neither catches everything.)