From 7c43245b80a7b278bbee3ec40dad9377296b1e41 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 7 Jul 2016 14:47:19 -0700
Subject: [PATCH 08/20] WIP: Use more efficient hashtable for tidbitmap.c

---
 src/backend/nodes/tidbitmap.c | 357 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 355 insertions(+), 2 deletions(-)

diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index dfeb7d5..26b8f7a 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -97,6 +97,7 @@
 typedef struct PagetableEntry
 {
 	BlockNumber blockno;		/* page number (hashtable key) */
+	char		status;
 	bool		ischunk;		/* T = lossy storage, F = exact */
 	bool		recheck;		/* should the tuples be rechecked? */
 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
@@ -129,6 +130,7 @@ struct TIDBitmap
 	MemoryContext mcxt;			/* memory context containing me */
 	TBMStatus	status;			/* see codes above */
 	HTAB	   *pagetable;		/* hash table of PagetableEntry's */
+	struct simplehash *pagetable2;
 	int			nentries;		/* number of entries in pagetable */
 	int			maxentries;		/* limit on same to meet maxbytes */
 	int			npages;			/* number of exact entries in pagetable */
@@ -168,6 +170,237 @@ static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
 
+#define USE_SIMPLEHASH
+
+typedef struct simplehash
+{
+	uint32 size;
+	uint32 members;
+	uint32 deleted;
+	PagetableEntry *data;
+	MemoryContext ctx;
+	bool resizing;
+} simplehash;
+
+
+static inline uint32
+hash_blockno(BlockNumber b)
+{
+	uint32 h = b;
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+	return h;
+}
+
+static PagetableEntry *
+simplehash_insert(simplehash *tb, BlockNumber key, bool *found);
+
+static simplehash *
+simplehash_create(MemoryContext ctx, uint32 size)
+{
+	simplehash *tb;
+
+	/* FIXME: assert size is a power of two */
+	tb = MemoryContextAllocZero(ctx, sizeof(simplehash));
+	tb->ctx = ctx;
+	tb->size = size;
+	tb->data = MemoryContextAllocZero(ctx, sizeof(PagetableEntry) * tb->size);
+
+	return tb;
+}
+
+static void
+simplehash_destroy(simplehash *tb)
+{
+	pfree(tb->data);
+	pfree(tb);
+}
+
+static void
+simplehash_resize(simplehash *tb, uint32 newsize)
+{
+	uint32 oldsize = tb->size;
+	PagetableEntry *olddata = tb->data;
+	int i;
+
+	Assert(!tb->resizing);
+	tb->resizing = true;
+	tb->size = newsize;
+	tb->data = MemoryContextAllocExtended(tb->ctx, sizeof(PagetableEntry) * tb->size,
+										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+	ereport(DEBUG2,
+			(errmsg("resizing: %u to %u", oldsize, tb->size),
+			 errhidestmt(true)));
+
+	for (i = 0; i < oldsize; i++)
+	{
+		PagetableEntry *oldentry = &olddata[i];
+		if (oldentry->status == 1)
+		{
+			PagetableEntry *newentry;
+			bool found;
+			newentry = simplehash_insert(tb, oldentry->blockno, &found);
+			if (found)
+				elog(ERROR, "duplicate during resizing");
+
+			memcpy(newentry, oldentry, sizeof(PagetableEntry));
+		}
+	}
+
+	pfree(olddata);
+	tb->resizing = false;
+	tb->deleted = 0;
+}
+
+static PagetableEntry *
+simplehash_insert(simplehash *tb, BlockNumber key, bool *found)
+{
+	uint32 hash = hash_blockno(key);
+	uint32 startelem;
+	uint32 curelem;
+
+restart:
+	startelem = hash & (tb->size - 1);
+	curelem = startelem;
+
+	/* 10% toombstones */
+	if (!tb->resizing &&
+		tb->deleted * 10 >= tb->size)
+	{
+		elog(LOG, "compacting");
+		simplehash_resize(tb, tb->size);
+	}
+
+	while(true)
+	{
+		PagetableEntry *entry = &tb->data[curelem];
+
+		if (entry->status == 0)
+		{
+			/* resize at 75% */
+			if (!tb->resizing &&
+				((tb->members + 1) * 4 > tb->size * 3))
+			{
+				simplehash_resize(tb, tb->size * 2);
+
+				/* start again */
+				goto restart;
+			}
+
+			tb->members++;
+			*found = false;
+			entry->blockno = key;
+			entry->status = 1;
+			return entry;
+		}
+		else if (entry->status == 2)
+		{
+			/* deleted entry, unusable without checking later keys */
+		}
+		else if (entry->blockno == key)
+		{
+			Assert(entry->status == 1);
+			*found = true;
+			return entry;
+		}
+
+		curelem++;
+
+		if (curelem == startelem)
+		{
+			elog(ERROR, "full?");
+		}
+		else if (curelem >= tb->size)
+		{
+			curelem = 0;
+		}
+	}
+}
+
+
+static PagetableEntry *
+simplehash_lookup(simplehash *tb, BlockNumber key)
+{
+	uint32 hash = hash_blockno(key);
+	const uint32 startelem = hash & (tb->size - 1);
+	uint32 curelem = startelem;
+
+	while(true)
+	{
+		PagetableEntry *entry = &tb->data[curelem];
+
+		if (entry->status == 0)
+		{
+			return NULL;
+		}
+
+		if (entry->status == 1 && entry->blockno == key)
+		{
+			return entry;
+		}
+
+		curelem++;
+
+		if (curelem == startelem)
+		{
+			elog(LOG, "lookup wrapped");
+			/* XXX: shouldn't, ever happen, no? */
+			return NULL;
+		}
+		else if (curelem >= tb->size)
+		{
+			curelem = 0;
+		}
+	}
+}
+
+static bool
+simplehash_delete(simplehash *tb, BlockNumber key)
+{
+	uint32 hash = hash_blockno(key);
+	uint32 startelem = hash & (tb->size - 1);
+	uint32 curelem = startelem;
+
+	while(true)
+	{
+		PagetableEntry *entry = &tb->data[curelem];
+
+		if (entry->status == 0)
+			return false;
+
+		if (entry->status == 1
+			&& entry->blockno == key)
+		{
+			/*
+			 * XXX: Moving neighbouring entries would likely be the better
+			 * implementation.
+			 */
+			entry->status = 2;
+			tb->members--;
+			tb->deleted++;
+
+			return true;
+		}
+
+		curelem++;
+
+		if (curelem == startelem)
+		{
+			/* XXX: shouldn't, ever happen, no? */
+			elog(LOG, "delete wrapped");
+			return false;
+		}
+		else if (curelem >= tb->size)
+		{
+			curelem = 0;
+		}
+	}
+}
+
 
 /*
  * tbm_create - create an initially-empty bitmap
@@ -212,9 +445,13 @@ tbm_create(long maxbytes)
 static void
 tbm_create_pagetable(TIDBitmap *tbm)
 {
+#ifndef USE_SIMPLEHASH
 	HASHCTL		hash_ctl;
+#endif
 
 	Assert(tbm->status != TBM_HASH);
+
+#ifndef USE_SIMPLEHASH
 	Assert(tbm->pagetable == NULL);
 
 	/* Create the hashtable proper */
@@ -239,6 +476,25 @@ tbm_create_pagetable(TIDBitmap *tbm)
 		Assert(!found);
 		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
 	}
+#else
+	Assert(tbm->pagetable2 == NULL);
+	tbm->pagetable2 = simplehash_create(tbm->mcxt, 128);
+
+	if (tbm->status == TBM_ONE_PAGE)
+	{
+		PagetableEntry *page;
+		bool		found;
+		char		oldstatus;
+
+		page = simplehash_insert(tbm->pagetable2,
+								 tbm->entry1.blockno,
+								 &found);
+		Assert(!found);
+		oldstatus = page->status;
+		memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
+		page->status = oldstatus;
+	}
+#endif
 
 	tbm->status = TBM_HASH;
 }
@@ -249,8 +505,13 @@ tbm_create_pagetable(TIDBitmap *tbm)
 void
 tbm_free(TIDBitmap *tbm)
 {
+#ifndef USE_SIMPLEHASH
 	if (tbm->pagetable)
 		hash_destroy(tbm->pagetable);
+#else
+	if (tbm->pagetable2)
+		simplehash_destroy(tbm->pagetable2);
+#endif
 	if (tbm->spages)
 		pfree(tbm->spages);
 	if (tbm->schunks)
@@ -357,13 +618,27 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
 		tbm_union_page(a, &b->entry1);
 	else
 	{
+#ifndef USE_SIMPLEHASH
 		HASH_SEQ_STATUS status;
+#else
+		int			i;
+#endif
 		PagetableEntry *bpage;
 
 		Assert(b->status == TBM_HASH);
+#ifndef USE_SIMPLEHASH
 		hash_seq_init(&status, b->pagetable);
 		while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
 			tbm_union_page(a, bpage);
+#else
+		for (i = 0; i < b->pagetable2->size; i++)
+		{
+			bpage = &b->pagetable2->data[i];
+			if (bpage->status != 1)
+				continue;
+			tbm_union_page(a, bpage);
+		}
+#endif
 	}
 }
 
@@ -449,13 +724,26 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 	}
 	else
 	{
+#ifndef USE_SIMPLEHASH
 		HASH_SEQ_STATUS status;
+#else
+		int i;
+#endif
 		PagetableEntry *apage;
 
 		Assert(a->status == TBM_HASH);
+#ifndef USE_SIMPLEHASH
 		hash_seq_init(&status, a->pagetable);
 		while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+#else
+		for (i = 0; i < a->pagetable2->size; i++)
+#endif
 		{
+#ifdef USE_SIMPLEHASH
+			apage = &a->pagetable2->data[i];
+			if (apage->status != 1)
+				continue;
+#endif
 			if (tbm_intersect_page(a, apage, b))
 			{
 				/* Page or chunk is now empty, remove it from a */
@@ -464,10 +752,15 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
 				else
 					a->npages--;
 				a->nentries--;
+#ifndef USE_SIMPLEHASH
 				if (hash_search(a->pagetable,
 								(void *) &apage->blockno,
 								HASH_REMOVE, NULL) == NULL)
 					elog(ERROR, "hash table corrupted");
+#else
+				if (!simplehash_delete(a->pagetable2, apage->blockno))
+					elog(ERROR, "hash table corrupted");
+#endif
 			}
 		}
 	}
@@ -606,7 +899,11 @@ tbm_begin_iterate(TIDBitmap *tbm)
 	 */
 	if (tbm->status == TBM_HASH && !tbm->iterating)
 	{
+#ifndef USE_SIMPLEHASH
 		HASH_SEQ_STATUS status;
+#else
+		int			i;
+#endif
 		PagetableEntry *page;
 		int			npages;
 		int			nchunks;
@@ -620,10 +917,19 @@ tbm_begin_iterate(TIDBitmap *tbm)
 				MemoryContextAlloc(tbm->mcxt,
 								   tbm->nchunks * sizeof(PagetableEntry *));
 
-		hash_seq_init(&status, tbm->pagetable);
 		npages = nchunks = 0;
+#ifndef USE_SIMPLEHASH
+		hash_seq_init(&status, tbm->pagetable);
 		while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+#else
+		for (i = 0; i < tbm->pagetable2->size; i++)
+#endif
 		{
+#ifdef USE_SIMPLEHASH
+			page = &tbm->pagetable2->data[i];
+			if (page->status != 1)
+				continue;
+#endif
 			if (page->ischunk)
 				tbm->schunks[nchunks++] = page;
 			else
@@ -791,9 +1097,13 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
 		return page;
 	}
 
+#ifndef USE_SIMPLEHASH
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &pageno,
 										  HASH_FIND, NULL);
+#else
+	page = simplehash_lookup(tbm->pagetable2, pageno);
+#endif
 	if (page == NULL)
 		return NULL;
 	if (page->ischunk)
@@ -833,16 +1143,22 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
 			tbm_create_pagetable(tbm);
 		}
 
+#ifndef USE_SIMPLEHASH
 		/* Look up or create an entry */
 		page = (PagetableEntry *) hash_search(tbm->pagetable,
 											  (void *) &pageno,
 											  HASH_ENTER, &found);
+#else
+		page = simplehash_insert(tbm->pagetable2, pageno, &found);
+#endif
 	}
 
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char oldstatus = page->status;
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = pageno;
 		/* must count it too */
 		tbm->nentries++;
@@ -869,9 +1185,15 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
 
 	bitno = pageno % PAGES_PER_CHUNK;
 	chunk_pageno = pageno - bitno;
+
+#ifndef USE_SIMPLEHASH
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &chunk_pageno,
 										  HASH_FIND, NULL);
+#else
+	page = simplehash_lookup(tbm->pagetable2, chunk_pageno);
+#endif
+
 	if (page != NULL && page->ischunk)
 	{
 		int			wordnum = WORDNUM(bitno);
@@ -912,6 +1234,7 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	 */
 	if (bitno != 0)
 	{
+#ifndef USE_SIMPLEHASH
 		if (hash_search(tbm->pagetable,
 						(void *) &pageno,
 						HASH_REMOVE, NULL) != NULL)
@@ -920,17 +1243,30 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 			tbm->nentries--;
 			tbm->npages--;		/* assume it must have been non-lossy */
 		}
+#else
+		if (simplehash_delete(tbm->pagetable2, pageno))
+		{
+			/* It was present, so adjust counts */
+			tbm->nentries--;
+			tbm->npages--;		/* assume it must have been non-lossy */
+		}
+#endif
 	}
 
+#ifndef USE_SIMPLEHASH
 	/* Look up or create entry for chunk-header page */
 	page = (PagetableEntry *) hash_search(tbm->pagetable,
 										  (void *) &chunk_pageno,
 										  HASH_ENTER, &found);
-
+#else
+	page = simplehash_insert(tbm->pagetable2, chunk_pageno, &found);
+#endif
 	/* Initialize it if not present before */
 	if (!found)
 	{
+		char oldstatus = page->status;
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* must count it too */
@@ -939,8 +1275,10 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 	}
 	else if (!page->ischunk)
 	{
+		char oldstatus = page->status;
 		/* chunk header page was formerly non-lossy, make it lossy */
 		MemSet(page, 0, sizeof(PagetableEntry));
+		page->status = oldstatus;
 		page->blockno = chunk_pageno;
 		page->ischunk = true;
 		/* we assume it had some tuple bit(s) set, so mark it lossy */
@@ -962,7 +1300,11 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
 static void
 tbm_lossify(TIDBitmap *tbm)
 {
+#ifndef USE_SIMPLEHASH
 	HASH_SEQ_STATUS status;
+#else
+	int i;
+#endif
 	PagetableEntry *page;
 
 	/*
@@ -977,9 +1319,18 @@ tbm_lossify(TIDBitmap *tbm)
 	Assert(!tbm->iterating);
 	Assert(tbm->status == TBM_HASH);
 
+#ifndef USE_SIMPLEHASH
 	hash_seq_init(&status, tbm->pagetable);
 	while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
+#else
+	for (i = 0; i < tbm->pagetable2->size; i++)
+#endif
 	{
+#ifdef USE_SIMPLEHASH
+		page = &tbm->pagetable2->data[i];
+		if (page->status != 1)
+			continue;
+#endif
 		if (page->ischunk)
 			continue;			/* already a chunk header */
 
@@ -996,7 +1347,9 @@ tbm_lossify(TIDBitmap *tbm)
 		if (tbm->nentries <= tbm->maxentries / 2)
 		{
 			/* we have done enough */
+#ifndef USE_SIMPLEHASH
 			hash_seq_term(&status);
+#endif
 			break;
 		}
 
-- 
2.8.1

