New gist vacuum.

Started by Костя Кузнецовover 10 years ago23 messages
1 attachment(s)

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Attachments:

seaq-access.patchtext/x-diff; name=seaq-access.patchDownload
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 0e49959..229d3f4 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -619,6 +619,12 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			if(GistPageIsDeleted(stack->page)) {
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index ff888e2..c99ff7e 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1128,11 +1128,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 7d596a3..a5fc876 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -20,6 +20,7 @@
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -777,15 +778,16 @@ gistNewBuffer(Relation r)
 		if (ConditionalLockBuffer(buffer))
 		{
 			Page		page = BufferGetPage(buffer);
+			PageHeader	p = (PageHeader) page;
 
 			if (PageIsNew(page))
 				return buffer;	/* OK to use, if never initialized */
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin)) {
 				return buffer;	/* OK to use */
-
+			}
 			LockBuffer(buffer, GIST_UNLOCK);
 		}
 
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 2337dbd..12533a5 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,7 +20,8 @@
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
-
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 /*
  * VACUUM cleanup: update FSM
@@ -108,6 +109,31 @@ typedef struct GistBDItem
 	struct GistBDItem *next;
 } GistBDItem;
 
+typedef struct GistBDSItem
+{
+	BlockNumber blkno;
+	bool isParent;
+	struct GistBDSItem *next;
+} GistBDSItem;
+
+typedef enum
+{
+	NOT_NEED_TO_PROCESS,	/* without action */
+	PROCESSED,				/* action is done */
+	NEED_TO_PROCESS,
+	NEED_TO_DELETE			/* */
+} GistBlockInfoFlag;
+
+typedef struct GistBlockInfo {
+	BlockNumber blkno;
+	BlockNumber parent;
+	BlockNumber leftblock;		/* block that has rightlink on blkno */
+	GistBlockInfoFlag flag;
+	//bool toDelete;				/* is need delete this block? */
+	//bool isDeleted;				/* this block was processed 	*/
+	bool hasRightLink;
+} GistBlockInfo;
+
 static void
 pushStackIfSplited(Page page, GistBDItem *stack)
 {
@@ -127,7 +153,150 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 		stack->next = ptr;
 	}
 }
+static void
+gistFillBlockInfo(HTAB * map, BlockNumber blkno)
+{
+	GistBlockInfo *entry;
+	bool		found;
 
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_ENTER,
+										   &found);
+	if(!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+		//entry->toDelete = false;
+		//entry->isDeleted = false;
+	}
+}
+
+static void
+gistMemorizeParentTab(HTAB * map, BlockNumber child, BlockNumber parent)
+{
+	GistBlockInfo *entry;
+	bool		found;
+
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &child,
+										   HASH_ENTER,
+										   &found);
+	if(!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+	entry->parent = parent;
+}
+static BlockNumber
+gistGetParentTab(HTAB * map, BlockNumber child)
+{
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &child,
+										   HASH_FIND,
+										   &found);
+	if (!found)
+		elog(ERROR, "could not find parent of block %d in lookup table", child);
+
+	return entry->parent;
+}
+
+static BlockNumber
+gistGetLeftLink(HTAB * map, BlockNumber right)
+{
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &right,
+										   HASH_FIND,
+										   &found);
+	if (!found)
+		return InvalidBlockNumber;
+	if(entry->hasRightLink == false) {
+		return InvalidBlockNumber;
+	}
+	return entry->leftblock;
+}
+static void
+gistMemorizeLeftLink(HTAB * map, BlockNumber right, BlockNumber left, bool hasRightLink)
+{
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &right,
+										   HASH_ENTER,
+										   &found);
+	if (!found) {
+		entry->leftblock = InvalidBlockNumber;
+		entry->parent = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+
+	if(hasRightLink) {
+		entry->leftblock = left;
+		entry->hasRightLink = hasRightLink;
+	} else {
+		if(!found) {
+			entry->leftblock = left;
+			entry->hasRightLink = hasRightLink;
+		}
+	}
+
+}
+
+static bool
+gistGetDeleteLink(HTAB* map, BlockNumber blkno) {
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_FIND,
+										   &found);
+
+	if (!found)
+		return false;
+
+	return entry->flag == NEED_TO_DELETE;
+}
+static bool
+gistIsProcessed(HTAB* map, BlockNumber blkno) {
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_FIND,
+										   &found);
+
+	return entry ? entry->flag == PROCESSED: false;
+}
+static void
+gistMemorizeLinkToDelete(HTAB* map, BlockNumber blkno, GistBlockInfoFlag flag) {
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_ENTER,
+										   &found);
+	if (!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+	entry->flag = flag;
+}
 
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
@@ -137,21 +306,20 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  *
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
-Datum
-gistbulkdelete(PG_FUNCTION_ARGS)
+static Datum
+gistbulkdeletelogical(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
+	/*
 	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
 	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
 	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
-	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
+	void	   *callback_state = (void *) PG_GETARG_POINTER(3); */
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
 
-	/* first time through? */
 	if (stats == NULL)
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
 	stats->estimated_count = false;
 	stats->num_index_tuples = 0;
 
@@ -184,21 +352,12 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 			page = (Page) BufferGetPage(buffer);
 			if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
 			{
-				/* only the root can become non-leaf during relock */
 				UnlockReleaseBuffer(buffer);
-				/* one more check */
 				continue;
 			}
 
-			/*
-			 * check for split proceeded after look at parent, we should check
-			 * it after relock
-			 */
 			pushStackIfSplited(page, stack);
 
-			/*
-			 * Remove deletable tuples from page
-			 */
 
 			maxoff = PageGetMaxOffsetNumber(page);
 
@@ -245,7 +404,6 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 		}
 		else
 		{
-			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
 			maxoff = PageGetMaxOffsetNumber(page);
@@ -281,3 +439,614 @@ gistbulkdelete(PG_FUNCTION_ARGS)
 
 	PG_RETURN_POINTER(stats);
 }
+
+static void
+gistvacuumcheckrightlink(Relation rel, IndexVacuumInfo * info,
+		HTAB* infomap, BlockNumber blkno, Page page)
+{
+	/*
+	 *
+	 * if there is right link on this page but not rightlink from this page. remove rightlink from left page.
+	 * if there is right link on this page and there is a right link . right link of left page must be rightlink to rightlink of this page.
+	 * */
+
+	BlockNumber leftblkno;
+	GISTPageOpaque childopaque;
+
+	leftblkno = gistGetLeftLink(infomap, blkno);
+	if (leftblkno != InvalidBlockNumber) {
+		/*
+		 * there is a right link to child page
+		 * */
+		BlockNumber newRight = InvalidBuffer;
+		GISTPageOpaque leftOpaque;
+		Page left;
+		Buffer leftbuffer;
+		leftbuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leftblkno,
+				RBM_NORMAL, info->strategy);
+		left = (Page) BufferGetPage(leftbuffer);
+
+		LockBuffer(leftbuffer, GIST_EXCLUSIVE);
+		childopaque = GistPageGetOpaque(page);
+		leftOpaque = GistPageGetOpaque(left);
+
+		while (leftOpaque->rightlink != InvalidBlockNumber
+				&& leftOpaque->rightlink != blkno) {
+			leftblkno = leftOpaque->rightlink;
+			UnlockReleaseBuffer(leftbuffer);
+			leftbuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leftblkno,
+					RBM_NORMAL, info->strategy);
+			left = (Page) BufferGetPage(leftbuffer);
+
+			LockBuffer(leftbuffer, GIST_EXCLUSIVE);
+			leftOpaque = GistPageGetOpaque(left);
+
+		}
+		if (leftOpaque->rightlink == InvalidBlockNumber) {
+			/*
+			 * error!! we dont find leftpage.
+			 * */
+
+			UnlockReleaseBuffer(leftbuffer);
+			return;
+		}
+		if (childopaque->rightlink != InvalidBlockNumber) {
+			newRight = childopaque->rightlink;
+		}
+		leftOpaque->rightlink = newRight;
+
+		if (RelationNeedsWAL(rel)) {
+			XLogRecPtr recptr;
+			recptr = gistXLogRightLinkChange(rel->rd_node, leftbuffer, newRight);
+			PageSetLSN(left, recptr);
+		} else
+			PageSetLSN(left, gistGetFakeLSN(rel));
+
+		UnlockReleaseBuffer(leftbuffer);
+	}
+}
+static void
+gistvacuumrepairpage(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+
+		HTAB* infomap, BlockNumber blkno)
+{
+	Buffer buffer;
+	Page page;
+	PageHeader header;
+	OffsetNumber maxoff, i;
+	IndexTuple idxtuple;
+	ItemId iid;
+	OffsetNumber todelete[MaxOffsetNumber];
+	int ntodelete = 0;
+	bool isNew;
+
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+			RBM_NORMAL, info->strategy);
+	LockBuffer(buffer, GIST_EXCLUSIVE);
+
+	gistcheckpage(rel, buffer);
+	page = (Page) BufferGetPage(buffer);
+	/*
+	 * if page is inner do nothing.
+	 * */
+	if(GistPageIsLeaf(page)) {
+		maxoff = PageGetMaxOffsetNumber(page);
+		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			if (callback(&(idxtuple->t_tid), callback_state)) {
+				todelete[ntodelete] = i - ntodelete;
+				ntodelete++;
+			}
+		}
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		if (ntodelete || isNew) {
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+
+			for (i = 0; i < ntodelete; i++)
+				PageIndexTupleDelete(page, todelete[i]);
+			GistMarkTuplesDeleted(page);
+
+			if (RelationNeedsWAL(rel)) {
+				XLogRecPtr recptr;
+
+				recptr = gistXLogUpdate(rel->rd_node, buffer, todelete,
+						ntodelete,
+						NULL, 0, InvalidBuffer);
+				PageSetLSN(page, recptr);
+			} else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+			END_CRIT_SECTION();
+
+			/*
+			 * ok. links has been deleted. and this in wal! now we can set deleted and repair rightlinks
+			 * */
+
+			gistvacuumcheckrightlink(rel, info, infomap, blkno, page);
+
+			/*
+			 * ok. rightlinks has been repaired.
+			 * */
+			header = (PageHeader) page;
+
+			header->pd_prune_xid = GetCurrentTransactionId();
+
+			GistPageSetDeleted(page);
+			stats->pages_deleted++;
+
+			if (RelationNeedsWAL(rel)) {
+				XLogRecPtr recptr;
+
+				recptr = gistXLogSetDeleted(rel->rd_node, buffer, header->pd_prune_xid);
+				PageSetLSN(page, recptr);
+			} else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+		}
+	}
+
+	UnlockReleaseBuffer(buffer);
+}
+static void
+gistphysicalvacuum(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+		BlockNumber npages, HTAB* infomap,
+		GistBDSItem* rescanstack, GistBDSItem* tail)
+{
+	BlockNumber blkno = GIST_ROOT_BLKNO;
+	for (; blkno < npages; blkno++) {
+		Buffer buffer;
+		Page page;
+		OffsetNumber i, maxoff;
+		IndexTuple idxtuple;
+		ItemId iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		int ntodelete = 0;
+		GISTPageOpaque opaque;
+		BlockNumber child;
+		GistBDSItem *item;
+		bool isNew;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+				info->strategy);
+		LockBuffer(buffer, GIST_SHARE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		opaque = GistPageGetOpaque(page);
+
+		gistFillBlockInfo(infomap, blkno);
+
+		gistMemorizeLeftLink(infomap, blkno, InvalidBlockNumber, false);
+
+		if(opaque->rightlink != InvalidBlockNumber) {
+			gistMemorizeLeftLink(infomap, opaque->rightlink, blkno, true);
+		}
+		if (GistPageIsLeaf(page)) {
+
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state)) {
+					todelete[ntodelete] = i - ntodelete;
+					ntodelete++;
+					stats->tuples_removed += 1;
+				} else
+					stats->num_index_tuples += 1;
+			}
+		} else {
+			/*
+			 * first scan
+			 * */
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			if (blkno != GIST_ROOT_BLKNO
+					/*&& (GistFollowRight(page) || lastNSN < GistPageGetNSN(page)) */
+					&& opaque->rightlink != InvalidBlockNumber) {
+				/*
+				 * build left link map. add to rescan later.
+				 * */
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+				item->isParent = false;
+				item->blkno = opaque->rightlink;
+				item->next = NULL;
+
+				tail->next = item;
+				tail = item;
+
+			}
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				child = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				gistMemorizeParentTab(infomap, child, blkno);
+
+				if (GistTupleIsInvalid(idxtuple))
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it.")));
+			}
+
+		}
+		if (ntodelete || isNew) {
+			if ((maxoff == ntodelete) || isNew) {
+
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+				item->isParent = true;
+				item->blkno = blkno;
+				item->next = NULL;
+
+				tail->next = item;
+				tail = item;
+
+
+				gistMemorizeLinkToDelete(infomap, blkno, NEED_TO_DELETE);
+			} else {
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				for (i = 0; i < ntodelete; i++)
+					PageIndexTupleDelete(page, todelete[i]);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel)) {
+					XLogRecPtr recptr;
+
+					recptr = gistXLogUpdate(rel->rd_node, buffer, todelete,
+							ntodelete,
+							NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				} else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+
+				END_CRIT_SECTION();
+			}
+		}
+
+		UnlockReleaseBuffer(buffer);
+		vacuum_delay_point();
+	}
+}
+static void
+gistrescanvacuum(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+		HTAB* infomap,
+		GistBDSItem* rescanstack, GistBDSItem* tail)
+{
+	GistBDSItem * ptr;
+	while (rescanstack != NULL) {
+		Buffer buffer;
+		Page page;
+		OffsetNumber i, maxoff;
+		IndexTuple idxtuple;
+		ItemId iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		int ntodelete = 0;
+		GISTPageOpaque opaque;
+		BlockNumber blkno, child;
+		Buffer childBuffer;
+		GistBDSItem *item;
+		bool isNew;
+		bool isProcessed;
+
+		BlockNumber setdeletedblkno[MaxOffsetNumber];
+
+		blkno = rescanstack->blkno;
+		if (gistGetParentTab(infomap, blkno) == InvalidBlockNumber && blkno != GIST_ROOT_BLKNO) {
+			/*
+			 * strange pages. it's maybe(pages without parent but is not root).
+			 * for example when last vacuum shut down and we can delete link to this page but dont set deleted
+			 * repair that pages.
+			 * how repaire: remove data if exists. rightlink repair. set-deleted
+			 */
+			gistvacuumrepairpage(rel, info, stats, callback, callback_state, infomap, blkno);
+
+			ptr = rescanstack->next;
+			pfree(rescanstack);
+			rescanstack = ptr;
+
+			vacuum_delay_point();
+			continue;
+		}
+		if (rescanstack->isParent == true) {
+			blkno = gistGetParentTab(infomap, blkno);
+		}
+
+		isProcessed = gistIsProcessed(infomap, blkno);
+
+		if(isProcessed == true || blkno == InvalidBlockNumber) {
+
+			ptr = rescanstack->next;
+			pfree(rescanstack);
+			rescanstack = ptr;
+
+			vacuum_delay_point();
+			continue;
+		}
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+				RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_SHARE);
+
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		opaque = GistPageGetOpaque(page);
+
+		if (blkno != GIST_ROOT_BLKNO
+				&& opaque->rightlink != InvalidBlockNumber) {
+			item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+			item->isParent = false;
+			item->blkno = opaque->rightlink;
+			item->next = rescanstack->next;
+
+			rescanstack->next = item;
+		}
+
+		if (GistPageIsLeaf(page)) {
+			/* usual procedure with leafs pages*/
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state)) {
+					todelete[ntodelete] = i - ntodelete;
+					ntodelete++;
+				}
+			}
+		} else {
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				bool delete;
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				child = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				delete = gistGetDeleteLink(infomap, child);
+				/*
+				 * leaf is needed to delete????
+				 * */
+				if (delete) {
+					IndexTuple idxtuplechild;
+					ItemId iidchild;
+					OffsetNumber todeletechild[MaxOffsetNumber];
+					int ntodeletechild = 0;
+					OffsetNumber j, maxoffchild;
+					Page childpage;
+					bool childIsNew;
+
+					childBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, child,
+							RBM_NORMAL, info->strategy);
+
+					LockBuffer(childBuffer, GIST_EXCLUSIVE);
+
+					childpage = (Page) BufferGetPage(childBuffer);
+					childIsNew = PageIsNew(childpage) || PageIsEmpty(childpage);
+
+					if (GistPageIsLeaf(childpage)) {
+						maxoffchild = PageGetMaxOffsetNumber(childpage);
+						for (j = FirstOffsetNumber; j <= maxoffchild; j =
+								OffsetNumberNext(j)) {
+							iidchild = PageGetItemId(childpage, j);
+							idxtuplechild = (IndexTuple) PageGetItem(childpage,
+									iidchild);
+
+							if (callback(&(idxtuplechild->t_tid),
+									callback_state)) {
+								todeletechild[ntodeletechild] = j
+										- ntodeletechild;
+								ntodeletechild++;
+							}
+						}
+						if (ntodeletechild || childIsNew) {
+							START_CRIT_SECTION();
+
+							MarkBufferDirty(childBuffer);
+
+							for (j = 0; j < ntodeletechild; j++)
+								PageIndexTupleDelete(childpage,
+										todeletechild[j]);
+							GistMarkTuplesDeleted(childpage);
+
+							if (RelationNeedsWAL(rel)) {
+								XLogRecPtr recptr;
+
+								recptr = gistXLogUpdate(rel->rd_node,
+										childBuffer, todeletechild,
+										ntodeletechild,
+										NULL, 0, InvalidBuffer);
+								PageSetLSN(childpage, recptr);
+							} else
+								PageSetLSN(childpage, gistGetFakeLSN(rel));
+
+							END_CRIT_SECTION();
+
+							if ((ntodeletechild == maxoffchild) || childIsNew) {
+								/*
+								 *
+								 * if there is right link on this page but not rightlink from this page. remove rightlink from left page.
+								 * if there is right link on this page and there is a right link . right link of left page must be rightlink to rightlink of this page.
+								 * */
+								todelete[ntodelete] = i - ntodelete;
+								setdeletedblkno[ntodelete] = child;
+								ntodelete++;
+							}
+						}
+					}
+					UnlockReleaseBuffer(childBuffer);
+				}
+			}
+		}
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		if (ntodelete || isNew) {
+			if(GistPageIsLeaf(page)) {
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+				item->isParent = false;
+				item->blkno = gistGetParentTab(infomap, blkno);
+				item->next = rescanstack->next;
+				rescanstack->next = item;
+			} else {
+				/*
+				 * delete links to pages
+				 * */
+				if(ntodelete && (ntodelete == maxoff) ) {
+					// save 1 link on inner page
+					ntodelete--;
+				}
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				for (i = 0; i < ntodelete; i++)
+					PageIndexTupleDelete(page, todelete[i]);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel)) {
+					XLogRecPtr recptr;
+
+					recptr = gistXLogUpdate(rel->rd_node, buffer, todelete,
+							ntodelete,
+							NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				} else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+				END_CRIT_SECTION();
+
+				/*
+				 * ok. links has been deleted. and this in wal! now we can set deleted and repair rightlinks
+				 * */
+				for (i = 0; i < ntodelete; i++) {
+					Buffer childBuffertodelete;
+					Page childpagetodelete;
+					PageHeader p;
+					gistMemorizeLinkToDelete(infomap, setdeletedblkno[i], PROCESSED);
+
+					childBuffertodelete = ReadBufferExtended(rel, MAIN_FORKNUM, setdeletedblkno[i],
+							RBM_NORMAL, info->strategy);
+
+					LockBuffer(childBuffertodelete, GIST_EXCLUSIVE);
+
+					childpagetodelete = (Page) BufferGetPage(childBuffertodelete);
+
+					p = (PageHeader) childpagetodelete;
+
+					p->pd_prune_xid = GetCurrentTransactionId();
+
+					gistvacuumcheckrightlink(rel, info, infomap,
+							setdeletedblkno[i], childpagetodelete);
+					GistPageSetDeleted(childpagetodelete);
+					MarkBufferDirty(childBuffertodelete);
+					UnlockReleaseBuffer(childBuffertodelete);
+					stats->pages_deleted++;
+				}
+			}
+		}
+		gistMemorizeLinkToDelete(infomap, blkno, PROCESSED);
+		UnlockReleaseBuffer(buffer);
+
+		ptr = rescanstack->next;
+		pfree(rescanstack);
+		rescanstack = ptr;
+
+		vacuum_delay_point();
+	}
+}
+
+Datum
+gistbulkdelete(PG_FUNCTION_ARGS)
+{
+	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
+	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
+	Relation	rel = info->index;
+	GistBDSItem *rescanstack = NULL,
+			   *tail = NULL;
+
+	int memoryneeded = 0;
+
+	BlockNumber npages;
+
+	bool needLock;
+	HTAB	   *infomap;
+	HASHCTL		hashCtl;
+
+
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	hashCtl.keysize = sizeof(BlockNumber);
+	hashCtl.entrysize = sizeof(GistBlockInfo);
+	hashCtl.hcxt = CurrentMemoryContext;
+
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/*
+	 * estimate memory limit
+	 * if map more than maintance_mem_work use old version of vacuum
+	 * */
+
+	memoryneeded = npages * (sizeof(GistBlockInfo));
+	if(memoryneeded > maintenance_work_mem * 1024) {
+		return gistbulkdeletelogical(info, stats, callback, callback_state);
+	}
+
+
+	infomap = hash_create("gistvacuum info map",
+										npages,
+										&hashCtl,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT );
+
+	rescanstack = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+	rescanstack->isParent = false;
+	rescanstack->blkno = GIST_ROOT_BLKNO;
+	rescanstack->next = NULL;
+	tail = rescanstack;
+
+	/*
+	 * this part of the vacuum use scan in physical order. Also this function fill hashmap `infomap`
+	 * that stores information about parent, rightlinks and etc. Pages is needed to rescan will be pushed to tail of rescanstack.
+	 * this function don't set flag gist_deleted.
+	 * */
+	gistphysicalvacuum(rel, info, stats, callback, callback_state, npages, infomap, rescanstack, tail);
+	/*
+	 * this part of the vacuum is not in physical order. It scans only pages from rescanstack.
+	 * we get page if this page is leaf we use usual procedure, but if pages is inner that we scan
+	 * it and delete links to childrens(but firstly recheck children and if all is ok).
+	 * if any pages is empty or new after processing set flag gist_delete , store prune_xid number
+	 * and etc. if all links from pages are deleted push parent of page to rescan stack to processing.
+	 * special case is when all tuples are deleted from index. in this case root block will be setted in leaf.
+	 * */
+	gistrescanvacuum(rel, info, stats, callback, callback_state, infomap, rescanstack, tail);
+
+	hash_destroy(infomap);
+	PG_RETURN_POINTER(stats);
+}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index fbdbb3c..bd47550 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -58,6 +58,50 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoRightlinkChange(XLogReaderState *record) {
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageRightlinkChange *xldata = (gistxlogPageRightlinkChange *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	BlockNumber	newright;
+	GISTPageOpaque opaque;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		newright = xldata->newRightLink;
+		page = BufferGetPage(buffer);
+		opaque = GistPageGetOpaque(page);
+		opaque->rightlink = newright;
+		/*if(newright == InvalidBlockNumber) {
+			GistClearFollowRight(page);
+		}*/
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+}
+
+static void
+gistRedoPageSetDeleted(XLogReaderState *record) {
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	PageHeader		header;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+		header = (PageHeader) page;
+
+		header->pd_prune_xid = xldata->id;
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -75,23 +119,26 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 		char	   *data;
 		Size		datalen;
 		int			ninserted = 0;
+		//BlockNumber blkno;
 
 		data = begin = XLogRecGetBlockData(record, 0, &datalen);
 
 		page = (Page) BufferGetPage(buffer);
 
+		//blkno = BufferGetBlockNumber(buffer);
 		/* Delete old tuples */
 		if (xldata->ntodelete > 0)
 		{
 			int			i;
+			//OffsetNumber max = PageGetMaxOffsetNumber(page);
 			OffsetNumber *todelete = (OffsetNumber *) data;
 
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			for (i = 0; i < xldata->ntodelete; i++)
 				PageIndexTupleDelete(page, todelete[i]);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* add tuples */
@@ -301,6 +348,12 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
+		case XLOG_GIST_RIGHTLINK_CHANGE:
+			gistRedoRightlinkChange(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -377,6 +430,40 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
 	return recptr;
 }
 
+
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
+XLogRecPtr
+gistXLogRightLinkChange(RelFileNode node, Buffer buffer,
+					BlockNumber newRightLink) {
+	gistxlogPageRightlinkChange xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.newRightLink = newRightLink;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageRightlinkChange));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_RIGHTLINK_CHANGE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
@@ -388,6 +475,7 @@ gistXLogSplit(RelFileNode node, BlockNumber blkno, bool page_is_leaf,
  * to the target buffer; they need not be stored in XLOG if XLogInsert decides
  * to log the whole buffer contents instead.
  */
+
 XLogRecPtr
 gistXLogUpdate(RelFileNode node, Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 4f1a5c3..61a2e50 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -50,6 +50,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -179,7 +184,8 @@ typedef GISTScanOpaqueData *GISTScanOpaque;
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
+#define XLOG_GIST_RIGHTLINK_CHANGE		 0x70
 
 /*
  * Backup Blk 0: updated page.
@@ -216,6 +222,18 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+	TransactionId id;
+} gistxlogPageDelete;
+
+typedef struct gistxlogPageRightlinkChange
+{
+	BlockNumber newRightLink;
+
+} gistxlogPageRightlinkChange;
+
+
 typedef struct gistxlogPage
 {
 	BlockNumber blkno;
@@ -453,6 +471,12 @@ extern const char *gist_identify(uint8 info);
 extern void gist_xlog_startup(void);
 extern void gist_xlog_cleanup(void);
 
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid);
+
+extern XLogRecPtr gistXLogRightLinkChange(RelFileNode node, Buffer buffer,
+					BlockNumber newRightLink) ;
+
 extern XLogRecPtr gistXLogUpdate(RelFileNode node, Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
#2Michael Paquier
michael.paquier@gmail.com
In reply to: Костя Кузнецов (#1)
Re: New gist vacuum.

On Fri, Sep 11, 2015 at 7:52 AM, Костя Кузнецов <chapaev28@ya.ru> wrote:

old version:

INFO: vacuuming "public.point_tbl"
INFO: scanned index "gpointind" to remove 11184520 row versions
DETAIL: CPU 84.70s/72.26u sec elapsed 27007.14 sec.
[...]

new vacuum is about
INFO: vacuuming "public.point_tbl"
INFO: scanned index "gpointind" to remove 11184520 row versions
DETAIL: CPU 13.00s/27.57u sec elapsed 1864.22 sec.
[...]
There is a big speed up + we can reuse some pages.

Indeed. Interesting. You should definitely add your patch to the next
commit fest:
https://commitfest.postgresql.org/7/
--
Michael

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#3Jeff Janes
jeff.janes@gmail.com
In reply to: Костя Кузнецов (#1)
Re: New gist vacuum.

On Thu, Sep 10, 2015 at 3:52 PM, Костя Кузнецов <chapaev28@ya.ru> wrote:

Hello. I am student from gsoc programm.
My project is sequantial access in vacuum of gist.

New vacuum has 2 big step:
physical order scan pages and cleaning after 1 step.

1 scan - scan all pages and create information map(hashmap) and add
information to rescan stack( stack of pages that needed to rescanning

This is interesting work. I think the patch needs a rebase to the git
HEAD. There is a minor conflict in gistRedoPageUpdateRecord. It is a
little confusing because your patch introduces new code and then
immediately comments it out (using //, which is not a comment style
allowed in our style guide) and that phantom code confuses the
conflict resolution process.

There are several other places throughout the patch that use //
comment style to comment out code which the patch itself added. Those
should be removed, and the real comments should be converted to /*
this */ style.

I also got a compiler warning, it looks like a missing #include:

gistutil.c: In function 'gistNewBuffer':
gistutil.c:788:4: warning: implicit declaration of function
'TransactionIdPrecedes' [-Wimplicit-function-declaration]
if (GistPageIsDeleted(page) &&
TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin)) {
^

Also, I didn't see a check on the size of the stack. Is there an
argument that this stack will not be able to grow to be large enough
to cause trouble?

Thanks,

Jeff

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

In reply to: Jeff Janes (#3)
Re: New gist vacuum.

<div>Thank you, Jeff.</div><div>I reworking patch now. All // warning will be deleted.</div><div>About memory consumption new version will control size of stack and will operate with map of little size because i want delete old style vacuum(now ifО©╫maintenance_work_mem less than needed to build info map we use old-style vacuum with logical order).</div>

#5Alvaro Herrera
alvherre@2ndquadrant.com
In reply to: Костя Кузнецов (#4)
Re: New gist vacuum.

Костя Кузнецов wrote:

<div>Thank you, Jeff.</div><div>I reworking patch now. All // warning will be deleted.</div><div>About memory consumption new version will control size of stack and will operate with map of little size because i want delete old style vacuum(now if maintenance_work_mem less than needed to build info map we use old-style vacuum with logical order).</div>

You never got around to submitting the updated version of this patch,
and it's been a long time now, so I'm marking it as returned with
feedback for now. Please do submit a new version once you have it,
since this seems to be very useful.

--
Álvaro Herrera http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services

--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

#6Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Alvaro Herrera (#5)
1 attachment(s)
Re: New gist vacuum.

Hello!

31 янв. 2016 г., в 17:18, Alvaro Herrera <alvherre@2ndquadrant.com> написал(а):

Костя Кузнецов wrote:

<div>Thank you, Jeff.</div><div>I reworking patch now. All // warning will be deleted.</div><div>About memory consumption new version will control size of stack and will operate with map of little size because i want delete old style vacuum(now if maintenance_work_mem less than needed to build info map we use old-style vacuum with logical order).</div>

You never got around to submitting the updated version of this patch,
and it's been a long time now, so I'm marking it as returned with
feedback for now. Please do submit a new version once you have it,
since this seems to be very useful.

I've rebased patch (see attachment), it seems to work. It still requires a bit of styling, docs and tests, at least.
Also, I thinks that hash table is not very good option if we have all pages there: we should either use array or do not fill table for every page.

If author and community do not object, I want to continue work on Konstantin's patch.

Best regards, Andrey Borodin.

Attachments:

0001-GiST-VACUUM-rebase.patchapplication/octet-stream; name=0001-GiST-VACUUM-rebase.patch; x-unix-mode=0644Download
From d6587b0bc88718d7049eb16be0db4d216832acce Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Thu, 9 Nov 2017 21:03:58 +0300
Subject: [PATCH] GiST VACUUM rebase

Some forgoten markers

More forgoten markers

more rebase

some cleanup
---
 src/backend/access/gist/gist.c       |   6 +
 src/backend/access/gist/gistbuild.c  |   5 -
 src/backend/access/gist/gistutil.c   |   4 +-
 src/backend/access/gist/gistvacuum.c | 801 ++++++++++++++++++++++++++++++++++-
 src/backend/access/gist/gistxlog.c   |  88 +++-
 src/include/access/gist_private.h    |  26 +-
 src/include/access/gistxlog.h        |  24 +-
 7 files changed, 923 insertions(+), 31 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index aec174cd00..5c3b00f52f 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -693,6 +693,12 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			if(GistPageIsDeleted(stack->page)) {
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index b4cb364869..39a71e0fb3 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 26d89f79ae..5943180b12 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -24,6 +24,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -801,13 +802,14 @@ gistNewBuffer(Relation r)
 		if (ConditionalLockBuffer(buffer))
 		{
 			Page		page = BufferGetPage(buffer);
+			PageHeader	p = (PageHeader) page;
 
 			if (PageIsNew(page))
 				return buffer;	/* OK to use, if never initialized */
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 77d9d12f0b..552863adf3 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,6 +20,8 @@
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 
 /*
@@ -106,6 +108,31 @@ typedef struct GistBDItem
 	struct GistBDItem *next;
 } GistBDItem;
 
+typedef struct GistBDSItem
+{
+	BlockNumber blkno;
+	bool isParent;
+	struct GistBDSItem *next;
+} GistBDSItem;
+
+typedef enum
+{
+	NOT_NEED_TO_PROCESS,	/* without action */
+	PROCESSED,				/* action is done */
+	NEED_TO_PROCESS,
+	NEED_TO_DELETE			/* */
+} GistBlockInfoFlag;
+
+typedef struct GistBlockInfo {
+	BlockNumber blkno;
+	BlockNumber parent;
+	BlockNumber leftblock;		/* block that has rightlink on blkno */
+	GistBlockInfoFlag flag;
+	//bool toDelete;				/* is need delete this block? */
+	//bool isDeleted;				/* this block was processed 	*/
+	bool hasRightLink;
+} GistBlockInfo;
+
 static void
 pushStackIfSplited(Page page, GistBDItem *stack)
 {
@@ -125,7 +152,150 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 		stack->next = ptr;
 	}
 }
+static void
+gistFillBlockInfo(HTAB * map, BlockNumber blkno)
+{
+	GistBlockInfo *entry;
+	bool		found;
+
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_ENTER,
+										   &found);
+	if(!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+		//entry->toDelete = false;
+		//entry->isDeleted = false;
+	}
+}
+
+static void
+gistMemorizeParentTab(HTAB * map, BlockNumber child, BlockNumber parent)
+{
+	GistBlockInfo *entry;
+	bool		found;
+
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &child,
+										   HASH_ENTER,
+										   &found);
+	if(!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+	entry->parent = parent;
+}
+static BlockNumber
+gistGetParentTab(HTAB * map, BlockNumber child)
+{
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &child,
+										   HASH_FIND,
+										   &found);
+	if (!found)
+		elog(ERROR, "could not find parent of block %d in lookup table", child);
+
+	return entry->parent;
+}
+
+static BlockNumber
+gistGetLeftLink(HTAB * map, BlockNumber right)
+{
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &right,
+										   HASH_FIND,
+										   &found);
+	if (!found)
+		return InvalidBlockNumber;
+	if(entry->hasRightLink == false) {
+		return InvalidBlockNumber;
+	}
+	return entry->leftblock;
+}
+static void
+gistMemorizeLeftLink(HTAB * map, BlockNumber right, BlockNumber left, bool hasRightLink)
+{
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &right,
+										   HASH_ENTER,
+										   &found);
+	if (!found) {
+		entry->leftblock = InvalidBlockNumber;
+		entry->parent = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+
+	if(hasRightLink) {
+		entry->leftblock = left;
+		entry->hasRightLink = hasRightLink;
+	} else {
+		if(!found) {
+			entry->leftblock = left;
+			entry->hasRightLink = hasRightLink;
+		}
+	}
+
+}
+
+static bool
+gistGetDeleteLink(HTAB* map, BlockNumber blkno) {
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_FIND,
+										   &found);
 
+	if (!found)
+		return false;
+
+	return entry->flag == NEED_TO_DELETE;
+}
+static bool
+gistIsProcessed(HTAB* map, BlockNumber blkno) {
+	GistBlockInfo *entry;
+	bool		found;
+
+	/* Find node buffer in hash table */
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_FIND,
+										   &found);
+
+	return entry ? entry->flag == PROCESSED: false;
+}
+static void
+gistMemorizeLinkToDelete(HTAB* map, BlockNumber blkno, GistBlockInfoFlag flag) {
+	GistBlockInfo *entry;
+	bool		found;
+	entry = (GistBlockInfo *) hash_search(map,
+										   (const void *) &blkno,
+										   HASH_ENTER,
+										   &found);
+	if (!found) {
+		entry->parent = InvalidBlockNumber;
+		entry->leftblock = InvalidBlockNumber;
+		entry->hasRightLink = false;
+		entry->flag = NOT_NEED_TO_PROCESS;
+	}
+	entry->flag = flag;
+}
 
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
@@ -135,18 +305,20 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  *
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-			   IndexBulkDeleteCallback callback, void *callback_state)
+static IndexBulkDeleteResult *
+gistbulkdeletelogical(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
+	/*
+	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
+	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
+	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
+	void	   *callback_state = (void *) PG_GETARG_POINTER(3); */
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
 
-	/* first time through? */
 	if (stats == NULL)
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
 	stats->estimated_count = false;
 	stats->num_index_tuples = 0;
 
@@ -179,21 +351,12 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 			page = (Page) BufferGetPage(buffer);
 			if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
 			{
-				/* only the root can become non-leaf during relock */
 				UnlockReleaseBuffer(buffer);
-				/* one more check */
 				continue;
 			}
 
-			/*
-			 * check for split proceeded after look at parent, we should check
-			 * it after relock
-			 */
 			pushStackIfSplited(page, stack);
 
-			/*
-			 * Remove deletable tuples from page
-			 */
 
 			maxoff = PageGetMaxOffsetNumber(page);
 
@@ -237,7 +400,6 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		}
 		else
 		{
-			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
 			maxoff = PageGetMaxOffsetNumber(page);
@@ -273,3 +435,612 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 	return stats;
 }
+
+static void
+gistvacuumcheckrightlink(Relation rel, IndexVacuumInfo * info,
+		HTAB* infomap, BlockNumber blkno, Page page)
+{
+	/*
+	 *
+	 * if there is right link on this page but not rightlink from this page. remove rightlink from left page.
+	 * if there is right link on this page and there is a right link . right link of left page must be rightlink to rightlink of this page.
+	 */
+
+	BlockNumber leftblkno;
+	GISTPageOpaque childopaque;
+
+	leftblkno = gistGetLeftLink(infomap, blkno);
+	if (leftblkno != InvalidBlockNumber) {
+		/*
+		 * there is a right link to child page
+		 */
+		BlockNumber newRight = InvalidBuffer;
+		GISTPageOpaque leftOpaque;
+		Page left;
+		Buffer leftbuffer;
+		leftbuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leftblkno,
+				RBM_NORMAL, info->strategy);
+		left = (Page) BufferGetPage(leftbuffer);
+
+		LockBuffer(leftbuffer, GIST_EXCLUSIVE);
+		childopaque = GistPageGetOpaque(page);
+		leftOpaque = GistPageGetOpaque(left);
+
+		while (leftOpaque->rightlink != InvalidBlockNumber
+				&& leftOpaque->rightlink != blkno) {
+			leftblkno = leftOpaque->rightlink;
+			UnlockReleaseBuffer(leftbuffer);
+			leftbuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leftblkno,
+					RBM_NORMAL, info->strategy);
+			left = (Page) BufferGetPage(leftbuffer);
+
+			LockBuffer(leftbuffer, GIST_EXCLUSIVE);
+			leftOpaque = GistPageGetOpaque(left);
+
+		}
+		if (leftOpaque->rightlink == InvalidBlockNumber) {
+			/*
+			 * error!! we dont find leftpage.
+			 */
+
+			UnlockReleaseBuffer(leftbuffer);
+			return;
+		}
+		if (childopaque->rightlink != InvalidBlockNumber) {
+			newRight = childopaque->rightlink;
+		}
+		leftOpaque->rightlink = newRight;
+
+		if (RelationNeedsWAL(rel)) {
+			XLogRecPtr recptr;
+			recptr = gistXLogRightLinkChange(rel->rd_node, leftbuffer, newRight);
+			PageSetLSN(left, recptr);
+		} else
+			PageSetLSN(left, gistGetFakeLSN(rel));
+
+		UnlockReleaseBuffer(leftbuffer);
+	}
+}
+static void
+gistvacuumrepairpage(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+
+		HTAB* infomap, BlockNumber blkno)
+{
+	Buffer buffer;
+	Page page;
+	PageHeader header;
+	OffsetNumber maxoff, i;
+	IndexTuple idxtuple;
+	ItemId iid;
+	OffsetNumber todelete[MaxOffsetNumber];
+	int ntodelete = 0;
+	bool isNew;
+
+	buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+			RBM_NORMAL, info->strategy);
+	LockBuffer(buffer, GIST_EXCLUSIVE);
+
+	gistcheckpage(rel, buffer);
+	page = (Page) BufferGetPage(buffer);
+	/*
+	 * if page is inner do nothing.
+	 */
+	if(GistPageIsLeaf(page)) {
+		maxoff = PageGetMaxOffsetNumber(page);
+		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			if (callback(&(idxtuple->t_tid), callback_state)) {
+				todelete[ntodelete] = i - ntodelete;
+				ntodelete++;
+			}
+		}
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		if (ntodelete || isNew) {
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+
+			for (i = 0; i < ntodelete; i++)
+				PageIndexTupleDelete(page, todelete[i]);
+			GistMarkTuplesDeleted(page);
+
+			if (RelationNeedsWAL(rel)) {
+				XLogRecPtr recptr;
+
+				recptr = gistXLogUpdate(buffer, todelete,
+						ntodelete,
+						NULL, 0, InvalidBuffer);
+				PageSetLSN(page, recptr);
+			} else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+			END_CRIT_SECTION();
+
+			/*
+			 * ok. links has been deleted. and this in wal! now we can set deleted and repair rightlinks
+			 */
+
+			gistvacuumcheckrightlink(rel, info, infomap, blkno, page);
+
+			/*
+			 * ok. rightlinks has been repaired.
+			 */
+			header = (PageHeader) page;
+
+			header->pd_prune_xid = GetCurrentTransactionId();
+
+			GistPageSetDeleted(page);
+			stats->pages_deleted++;
+
+			if (RelationNeedsWAL(rel)) {
+				XLogRecPtr recptr;
+
+				recptr = gistXLogSetDeleted(rel->rd_node, buffer, header->pd_prune_xid);
+				PageSetLSN(page, recptr);
+			} else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+		}
+	}
+
+	UnlockReleaseBuffer(buffer);
+}
+static void
+gistphysicalvacuum(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+		BlockNumber npages, HTAB* infomap,
+		GistBDSItem* rescanstack, GistBDSItem* tail)
+{
+	BlockNumber blkno = GIST_ROOT_BLKNO;
+	for (; blkno < npages; blkno++) {
+		Buffer buffer;
+		Page page;
+		OffsetNumber i, maxoff;
+		IndexTuple idxtuple;
+		ItemId iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		int ntodelete = 0;
+		GISTPageOpaque opaque;
+		BlockNumber child;
+		GistBDSItem *item;
+		bool isNew;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+				info->strategy);
+		LockBuffer(buffer, GIST_SHARE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		opaque = GistPageGetOpaque(page);
+
+		gistFillBlockInfo(infomap, blkno);
+
+		gistMemorizeLeftLink(infomap, blkno, InvalidBlockNumber, false);
+
+		if(opaque->rightlink != InvalidBlockNumber) {
+			gistMemorizeLeftLink(infomap, opaque->rightlink, blkno, true);
+		}
+		if (GistPageIsLeaf(page)) {
+
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state)) {
+					todelete[ntodelete] = i - ntodelete;
+					ntodelete++;
+					stats->tuples_removed += 1;
+				} else
+					stats->num_index_tuples += 1;
+			}
+		} else {
+			/*
+			 * first scan
+			 */
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			if (blkno != GIST_ROOT_BLKNO
+					/* && (GistFollowRight(page) || lastNSN < GistPageGetNSN(page)) */
+					&& opaque->rightlink != InvalidBlockNumber) {
+				/*
+				 * build left link map. add to rescan later.
+				 */
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+				item->isParent = false;
+				item->blkno = opaque->rightlink;
+				item->next = NULL;
+
+				tail->next = item;
+				tail = item;
+
+			}
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				child = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				gistMemorizeParentTab(infomap, child, blkno);
+
+				if (GistTupleIsInvalid(idxtuple))
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid", RelationGetRelationName(rel)), errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), errhint("Please REINDEX it.")));
+			}
+
+		}
+		if (ntodelete || isNew) {
+			if ((maxoff == ntodelete) || isNew) {
+
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+				item->isParent = true;
+				item->blkno = blkno;
+				item->next = NULL;
+
+				tail->next = item;
+				tail = item;
+
+
+				gistMemorizeLinkToDelete(infomap, blkno, NEED_TO_DELETE);
+			} else {
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				for (i = 0; i < ntodelete; i++)
+					PageIndexTupleDelete(page, todelete[i]);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel)) {
+					XLogRecPtr recptr;
+
+					recptr = gistXLogUpdate(buffer, todelete,
+							ntodelete,
+							NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				} else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+
+				END_CRIT_SECTION();
+			}
+		}
+
+		UnlockReleaseBuffer(buffer);
+		vacuum_delay_point();
+	}
+}
+static void
+gistrescanvacuum(Relation rel, IndexVacuumInfo * info, IndexBulkDeleteResult * stats,
+		IndexBulkDeleteCallback callback, void* callback_state,
+		HTAB* infomap,
+		GistBDSItem* rescanstack, GistBDSItem* tail)
+{
+	GistBDSItem * ptr;
+	while (rescanstack != NULL) {
+		Buffer buffer;
+		Page page;
+		OffsetNumber i, maxoff;
+		IndexTuple idxtuple;
+		ItemId iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		int ntodelete = 0;
+		GISTPageOpaque opaque;
+		BlockNumber blkno, child;
+		Buffer childBuffer;
+		GistBDSItem *item;
+		bool isNew;
+		bool isProcessed;
+
+		BlockNumber setdeletedblkno[MaxOffsetNumber];
+
+		blkno = rescanstack->blkno;
+		if (gistGetParentTab(infomap, blkno) == InvalidBlockNumber && blkno != GIST_ROOT_BLKNO) {
+			/*
+			 * strange pages. it's maybe(pages without parent but is not root).
+			 * for example when last vacuum shut down and we can delete link to this page but dont set deleted
+			 * repair that pages.
+			 * how repaire: remove data if exists. rightlink repair. set-deleted
+			 */
+			gistvacuumrepairpage(rel, info, stats, callback, callback_state, infomap, blkno);
+
+			ptr = rescanstack->next;
+			pfree(rescanstack);
+			rescanstack = ptr;
+
+			vacuum_delay_point();
+			continue;
+		}
+		if (rescanstack->isParent == true) {
+			blkno = gistGetParentTab(infomap, blkno);
+		}
+
+		isProcessed = gistIsProcessed(infomap, blkno);
+
+		if(isProcessed == true || blkno == InvalidBlockNumber) {
+
+			ptr = rescanstack->next;
+			pfree(rescanstack);
+			rescanstack = ptr;
+
+			vacuum_delay_point();
+			continue;
+		}
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+				RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_SHARE);
+
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		opaque = GistPageGetOpaque(page);
+
+		if (blkno != GIST_ROOT_BLKNO
+				&& opaque->rightlink != InvalidBlockNumber) {
+			item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+			item->isParent = false;
+			item->blkno = opaque->rightlink;
+			item->next = rescanstack->next;
+
+			rescanstack->next = item;
+		}
+
+		if (GistPageIsLeaf(page)) {
+			/* usual procedure with leafs pages*/
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state)) {
+					todelete[ntodelete] = i - ntodelete;
+					ntodelete++;
+				}
+			}
+		} else {
+			LockBuffer(buffer, GIST_UNLOCK);
+			LockBuffer(buffer, GIST_EXCLUSIVE);
+			maxoff = PageGetMaxOffsetNumber(page);
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) {
+				bool delete;
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				child = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				delete = gistGetDeleteLink(infomap, child);
+				/*
+				 * leaf is needed to delete????
+				 */
+				if (delete) {
+					IndexTuple idxtuplechild;
+					ItemId iidchild;
+					OffsetNumber todeletechild[MaxOffsetNumber];
+					int ntodeletechild = 0;
+					OffsetNumber j, maxoffchild;
+					Page childpage;
+					bool childIsNew;
+
+					childBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, child,
+							RBM_NORMAL, info->strategy);
+
+					LockBuffer(childBuffer, GIST_EXCLUSIVE);
+
+					childpage = (Page) BufferGetPage(childBuffer);
+					childIsNew = PageIsNew(childpage) || PageIsEmpty(childpage);
+
+					if (GistPageIsLeaf(childpage)) {
+						maxoffchild = PageGetMaxOffsetNumber(childpage);
+						for (j = FirstOffsetNumber; j <= maxoffchild; j =
+								OffsetNumberNext(j)) {
+							iidchild = PageGetItemId(childpage, j);
+							idxtuplechild = (IndexTuple) PageGetItem(childpage,
+									iidchild);
+
+							if (callback(&(idxtuplechild->t_tid),
+									callback_state)) {
+								todeletechild[ntodeletechild] = j
+										- ntodeletechild;
+								ntodeletechild++;
+							}
+						}
+						if (ntodeletechild || childIsNew) {
+							START_CRIT_SECTION();
+
+							MarkBufferDirty(childBuffer);
+
+							for (j = 0; j < ntodeletechild; j++)
+								PageIndexTupleDelete(childpage,
+										todeletechild[j]);
+							GistMarkTuplesDeleted(childpage);
+
+							if (RelationNeedsWAL(rel)) {
+								XLogRecPtr recptr;
+
+								recptr = gistXLogUpdate(
+										childBuffer, todeletechild,
+										ntodeletechild,
+										NULL, 0, InvalidBuffer);
+								PageSetLSN(childpage, recptr);
+							} else
+								PageSetLSN(childpage, gistGetFakeLSN(rel));
+
+							END_CRIT_SECTION();
+
+							if ((ntodeletechild == maxoffchild) || childIsNew) {
+								/*
+								 *
+								 * if there is right link on this page but not rightlink from this page. remove rightlink from left page.
+								 * if there is right link on this page and there is a right link . right link of left page must be rightlink to rightlink of this page.
+								 */
+								todelete[ntodelete] = i - ntodelete;
+								setdeletedblkno[ntodelete] = child;
+								ntodelete++;
+							}
+						}
+					}
+					UnlockReleaseBuffer(childBuffer);
+				}
+			}
+		}
+		isNew = PageIsNew(page) || PageIsEmpty(page);
+		if (ntodelete || isNew) {
+			if(GistPageIsLeaf(page)) {
+				item = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+				item->isParent = false;
+				item->blkno = gistGetParentTab(infomap, blkno);
+				item->next = rescanstack->next;
+				rescanstack->next = item;
+			} else {
+				/*
+				 * delete links to pages
+				 */
+				if(ntodelete && (ntodelete == maxoff) ) {
+					/* save 1 link on inner page */
+					ntodelete--;
+				}
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				for (i = 0; i < ntodelete; i++)
+					PageIndexTupleDelete(page, todelete[i]);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel)) {
+					XLogRecPtr recptr;
+
+					recptr = gistXLogUpdate(buffer, todelete,
+							ntodelete,
+							NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				} else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+				END_CRIT_SECTION();
+
+				/*
+				 * ok. links has been deleted. and this in wal! now we can set deleted and repair rightlinks
+				 */
+				for (i = 0; i < ntodelete; i++) {
+					Buffer childBuffertodelete;
+					Page childpagetodelete;
+					PageHeader p;
+					gistMemorizeLinkToDelete(infomap, setdeletedblkno[i], PROCESSED);
+
+					childBuffertodelete = ReadBufferExtended(rel, MAIN_FORKNUM, setdeletedblkno[i],
+							RBM_NORMAL, info->strategy);
+
+					LockBuffer(childBuffertodelete, GIST_EXCLUSIVE);
+
+					childpagetodelete = (Page) BufferGetPage(childBuffertodelete);
+
+					p = (PageHeader) childpagetodelete;
+
+					p->pd_prune_xid = GetCurrentTransactionId();
+
+					gistvacuumcheckrightlink(rel, info, infomap,
+							setdeletedblkno[i], childpagetodelete);
+					GistPageSetDeleted(childpagetodelete);
+					MarkBufferDirty(childBuffertodelete);
+					UnlockReleaseBuffer(childBuffertodelete);
+					stats->pages_deleted++;
+				}
+			}
+		}
+		gistMemorizeLinkToDelete(infomap, blkno, PROCESSED);
+		UnlockReleaseBuffer(buffer);
+
+		ptr = rescanstack->next;
+		pfree(rescanstack);
+		rescanstack = ptr;
+
+		vacuum_delay_point();
+	}
+}
+
+IndexBulkDeleteResult *gistbulkdelete(IndexVacuumInfo *info,
+	IndexBulkDeleteResult *stats,
+	IndexBulkDeleteCallback callback,
+	void *callback_state)
+{
+	Relation	rel = info->index;
+	GistBDSItem *rescanstack = NULL,
+			   *tail = NULL;
+
+	int memoryneeded = 0;
+
+	BlockNumber npages;
+
+	bool needLock;
+	HTAB	   *infomap;
+	HASHCTL		hashCtl;
+
+
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	hashCtl.keysize = sizeof(BlockNumber);
+	hashCtl.entrysize = sizeof(GistBlockInfo);
+	hashCtl.hcxt = CurrentMemoryContext;
+
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/*
+	 * estimate memory limit
+	 * if map more than maintance_mem_work use old version of vacuum
+	 */
+
+	memoryneeded = npages * (sizeof(GistBlockInfo));
+	if(memoryneeded > maintenance_work_mem * 1024) {
+		return gistbulkdeletelogical(info, stats, callback, callback_state);
+	}
+
+
+	infomap = hash_create("gistvacuum info map",
+										npages,
+										&hashCtl,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT );
+
+	rescanstack = (GistBDSItem *) palloc(sizeof(GistBDSItem));
+
+	rescanstack->isParent = false;
+	rescanstack->blkno = GIST_ROOT_BLKNO;
+	rescanstack->next = NULL;
+	tail = rescanstack;
+
+	/*
+	 * this part of the vacuum use scan in physical order. Also this function fill hashmap `infomap`
+	 * that stores information about parent, rightlinks and etc. Pages is needed to rescan will be pushed to tail of rescanstack.
+	 * this function don't set flag gist_deleted.
+	 */
+	gistphysicalvacuum(rel, info, stats, callback, callback_state, npages, infomap, rescanstack, tail);
+	/*
+	 * this part of the vacuum is not in physical order. It scans only pages from rescanstack.
+	 * we get page if this page is leaf we use usual procedure, but if pages is inner that we scan
+	 * it and delete links to childrens(but firstly recheck children and if all is ok).
+	 * if any pages is empty or new after processing set flag gist_delete , store prune_xid number
+	 * and etc. if all links from pages are deleted push parent of page to rescan stack to processing.
+	 * special case is when all tuples are deleted from index. in this case root block will be setted in leaf.
+	 */
+	gistrescanvacuum(rel, info, stats, callback, callback_state, infomap, rescanstack, tail);
+
+	hash_destroy(infomap);
+	return stats;
+}
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 7fd91ce640..7b82c02e15 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,50 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoRightlinkChange(XLogReaderState *record) {
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageRightlinkChange *xldata = (gistxlogPageRightlinkChange *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	BlockNumber	newright;
+	GISTPageOpaque opaque;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		newright = xldata->newRightLink;
+		page = BufferGetPage(buffer);
+		opaque = GistPageGetOpaque(page);
+		opaque->rightlink = newright;
+		/*if(newright == InvalidBlockNumber) {
+			GistClearFollowRight(page);
+		}*/
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+}
+
+static void
+gistRedoPageSetDeleted(XLogReaderState *record) {
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	PageHeader		header;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+		header = (PageHeader) page;
+
+		header->pd_prune_xid = xldata->id;
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,8 +156,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+			
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* Add new tuples if any */
@@ -324,6 +368,12 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
+		case XLOG_GIST_RIGHTLINK_CHANGE:
+			gistRedoRightlinkChange(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +492,40 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
+XLogRecPtr
+gistXLogRightLinkChange(RelFileNode node, Buffer buffer,
+					BlockNumber newRightLink) {
+	gistxlogPageRightlinkChange xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.newRightLink = newRightLink;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageRightlinkChange));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_RIGHTLINK_CHANGE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index bfef2df420..8cc5909dc1 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,19 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid);
+
+extern XLogRecPtr gistXLogRightLinkChange(RelFileNode node, Buffer buffer,
+					BlockNumber newRightLink) ;
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 3b126eca2a..d91c8b7e54 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,15 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
+#define XLOG_GIST_RIGHTLINK_CHANGE		 0x70
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +62,25 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId id;
+} gistxlogPageDelete;
+
+typedef struct gistxlogPageRightlinkChange
+{
+   BlockNumber newRightLink;
+
+} gistxlogPageRightlinkChange;
+
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
-- 
2.13.5 (Apple Git-94)

#7Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Andrey Borodin (#6)
1 attachment(s)
Re: New gist vacuum.

Hi hackers!

Here is the patch that deletes pages during GiST VACUUM.

12 нояб. 2017 г., в 23:20, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):

If author and community do not object, I want to continue work on Konstantin's patch.

==Purpose==
Long story short, some time ago Konstantin Kuznetsov hacked out a patch that added GiST scan with physical order of scan.
This scan is using a lot of memory to build map of whole GiST graph. If there is not enough maintenance memory, patch had the fallback to old GiST VACUUM.
New behavior was deleting pages while old (still used) was not.

I've rebased patch, fixed some bugs and decided that it solves too much in a single step.

Here is the patch, which adds functionality of GiST page deletes.
If this is committed, porting physical scan code will be much easier.

==What is changed==
When GiST VACUUM scans graph for removed tuples, it remembers internal pages that are referencing completely empty leaf pages.
Then in additional step, these pages are rescanned to delete references and mark leaf pages as free.

==Limitations==
At least one reference on each internal pages is left undeleted to preserve balancing of the tree.
Pages that has FOLLOW-RIGHT flag also are not deleted, even if empty.

Thank you for your attention, any thoughts are welcome.

Best regards, Andrey Borodin.

Attachments:

0001-Delete-pages-during-GiST-VACCUM.patchapplication/octet-stream; name=0001-Delete-pages-during-GiST-VACCUM.patch; x-unix-mode=0644Download
From 3a6d6b107043b130faa44615e0ad67e1e87b5217 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 19 Dec 2017 15:38:32 +0500
Subject: [PATCH] Delete pages during GiST VACCUM

---
 src/backend/access/gist/gist.c       |   7 +++
 src/backend/access/gist/gistbuild.c  |   5 --
 src/backend/access/gist/gistutil.c   |   4 +-
 src/backend/access/gist/gistvacuum.c | 117 +++++++++++++++++++++++++++++++++--
 src/backend/access/gist/gistxlog.c   |  47 +++++++++++++-
 src/include/access/gist_private.h    |  23 ++++---
 src/include/access/gistxlog.h        |  16 ++++-
 src/test/regress/expected/gist.out   |   4 +-
 src/test/regress/sql/gist.sql        |   4 +-
 9 files changed, 201 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index cf4b319b4e..2dc6bedf8f 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -693,6 +693,13 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 2415f00e06..fee533966f 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index d8d1c0acfc..3f6fff011b 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -24,6 +24,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -801,13 +802,14 @@ gistNewBuffer(Relation r)
 		if (ConditionalLockBuffer(buffer))
 		{
 			Page		page = BufferGetPage(buffer);
+			PageHeader	p = (PageHeader) page;
 
 			if (PageIsNew(page))
 				return buffer;	/* OK to use, if never initialized */
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 77d9d12f0b..30f968ff31 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,6 +20,8 @@
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 
 /*
@@ -126,7 +128,6 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	}
 }
 
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -136,12 +137,14 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
 IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-			   IndexBulkDeleteCallback callback, void *callback_state)
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
+	BlockNumber recentParent = InvalidBlockNumber;
+	List	   *rescanList = NULL;
+	ListCell   *cell;
 
 	/* first time through? */
 	if (stats == NULL)
@@ -234,9 +237,16 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 				END_CRIT_SECTION();
 			}
 
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
+				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				rescanList = lappend_int(rescanList, recentParent);
+			}
 		}
 		else
 		{
+			recentParent = stack->blkno;
 			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
@@ -271,5 +281,104 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		vacuum_delay_point();
 	}
 
+	/* rescan inner pages that had empty child pages */
+	foreach(cell,rescanList)
+	{
+		Buffer		buffer;
+		Page		page;
+		OffsetNumber i,
+					maxoff;
+		IndexTuple	idxtuple;
+		ItemId		iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		Buffer		buftodelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		Assert(!GistPageIsLeaf(page));
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+			{
+				Buffer		leafBuffer;
+				Page		leafPage;
+
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
+									RBM_NORMAL, info->strategy);
+				LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+				gistcheckpage(rel, leafBuffer);
+				leafPage = (Page) BufferGetPage(leafBuffer);
+				Assert(GistPageIsLeaf(leafPage));
+
+				if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+					&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+					&& ntodelete < maxoff-1) /* We must keep at leaf one leaf page per each */
+				{
+					buftodelete[ntodelete] = leafBuffer;
+					todelete[ntodelete++] = i;
+				}
+				else
+					UnlockReleaseBuffer(leafBuffer);
+			}
+
+
+		if (ntodelete)
+		{
+			/* Drop references from internal page */
+			TransactionId txid = GetCurrentTransactionId();
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+				PageIndexMultiDelete(page, todelete, ntodelete);
+
+			if (RelationNeedsWAL(rel))
+			{
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+			}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+
+			/* Mark pages as deleted */
+			for (i = 0; i < ntodelete; i++)
+			{
+				Page		leafPage = (Page)BufferGetPage(buftodelete[i]);
+				PageHeader	header = (PageHeader)leafPage;
+
+				header->pd_prune_xid = txid;
+
+				GistPageSetDeleted(leafPage);
+				MarkBufferDirty(buftodelete[i]);
+				stats->pages_deleted++;
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i], header->pd_prune_xid);
+					PageSetLSN(leafPage, recptr);
+				}
+				else
+					PageSetLSN(leafPage, gistGetFakeLSN(rel));
+
+				UnlockReleaseBuffer(buftodelete[i]);
+			}
+			END_CRIT_SECTION();
+		}
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	list_free(rescanList);
+
 	return stats;
-}
+}
\ No newline at end of file
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 7fd91ce640..0a008a4a77 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,29 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	PageHeader		header;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+		header = (PageHeader) page;
+
+		header->pd_prune_xid = xldata->id;
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+		UnlockReleaseBuffer(buffer);
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,8 +135,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* Add new tuples if any */
@@ -324,6 +347,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +468,23 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index eb1c6728d4..db7ceaf56f 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,16 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 3b126eca2a..52ab3a9308 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,14 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +61,18 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId id;
+} gistxlogPageDelete;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf..5b92f08c74 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13..e66396e851 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.14.3 (Apple Git-98)

#8Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Andrey Borodin (#7)
2 attachment(s)
Re: New gist vacuum.

Hi hackers!

19 дек. 2017 г., в 15:58, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Here is the patch that deletes pages during GiST VACUUM.

Here is new version of the patch for GiST VACUUM.
There are two main changes:
1. During rescan for page deletion only know to be recently empty pages are rescanned.
2. I've re-implemented physical scan with array instead of hash table.

Thanks!
Merry Christmas and happy New Year.

Best regards, Andrey Borodin.

Attachments:

0001-Delete-pages-during-GiST-VACCUM.patchapplication/octet-stream; name=0001-Delete-pages-during-GiST-VACCUM.patch; x-unix-mode=0644Download
From 3269896fc63041503788007874223c148fe67bf8 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 19 Dec 2017 15:38:32 +0500
Subject: [PATCH 1/2] Delete pages during GiST VACCUM

---
 src/backend/access/gist/gist.c       |   7 +++
 src/backend/access/gist/gistbuild.c  |   5 --
 src/backend/access/gist/gistutil.c   |   4 +-
 src/backend/access/gist/gistvacuum.c | 119 +++++++++++++++++++++++++++++++++--
 src/backend/access/gist/gistxlog.c   |  47 +++++++++++++-
 src/include/access/gist_private.h    |  23 ++++---
 src/include/access/gistxlog.h        |  16 ++++-
 src/test/regress/expected/gist.out   |   4 +-
 src/test/regress/sql/gist.sql        |   4 +-
 9 files changed, 203 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index cf4b319b4e..2dc6bedf8f 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -693,6 +693,13 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 2415f00e06..fee533966f 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index d8d1c0acfc..3f6fff011b 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -24,6 +24,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -801,13 +802,14 @@ gistNewBuffer(Relation r)
 		if (ConditionalLockBuffer(buffer))
 		{
 			Page		page = BufferGetPage(buffer);
+			PageHeader	p = (PageHeader) page;
 
 			if (PageIsNew(page))
 				return buffer;	/* OK to use, if never initialized */
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 77d9d12f0b..d191abe0fb 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,6 +20,8 @@
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 
 /*
@@ -126,7 +128,6 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	}
 }
 
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -136,12 +137,14 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
 IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-			   IndexBulkDeleteCallback callback, void *callback_state)
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
+	BlockNumber recentParent = InvalidBlockNumber;
+	List	   *rescanList = NULL;
+	ListCell   *cell;
 
 	/* first time through? */
 	if (stats == NULL)
@@ -234,9 +237,16 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 				END_CRIT_SECTION();
 			}
 
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
+				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				rescanList = lappend_int(rescanList, recentParent);
+			}
 		}
 		else
 		{
+			recentParent = stack->blkno;
 			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
@@ -271,5 +281,106 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		vacuum_delay_point();
 	}
 
+	/* rescan inner pages that had empty child pages */
+	foreach(cell,rescanList)
+	{
+		Buffer		buffer;
+		Page		page;
+		OffsetNumber i,
+					maxoff;
+		IndexTuple	idxtuple;
+		ItemId		iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		Buffer		buftodelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		Assert(!GistPageIsLeaf(page));
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		{
+			Buffer		leafBuffer;
+			Page		leafPage;
+
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
+								RBM_NORMAL, info->strategy);
+			LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+			gistcheckpage(rel, leafBuffer);
+			leafPage = (Page) BufferGetPage(leafBuffer);
+			Assert(GistPageIsLeaf(leafPage));
+
+			if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+				&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+				&& ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+			{
+				buftodelete[ntodelete] = leafBuffer;
+				todelete[ntodelete++] = i;
+			}
+			else
+				UnlockReleaseBuffer(leafBuffer);
+		}
+
+
+		if (ntodelete)
+		{
+			/* Drop references from internal page */
+			TransactionId txid = GetCurrentTransactionId();
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+				PageIndexMultiDelete(page, todelete, ntodelete);
+
+			if (RelationNeedsWAL(rel))
+			{
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+			}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+
+			/* Mark pages as deleted */
+			for (i = 0; i < ntodelete; i++)
+			{
+				Page		leafPage = (Page)BufferGetPage(buftodelete[i]);
+				PageHeader	header = (PageHeader)leafPage;
+
+				header->pd_prune_xid = txid;
+
+				GistPageSetDeleted(leafPage);
+				MarkBufferDirty(buftodelete[i]);
+				stats->pages_deleted++;
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i], header->pd_prune_xid);
+					PageSetLSN(leafPage, recptr);
+				}
+				else
+					PageSetLSN(leafPage, gistGetFakeLSN(rel));
+
+				UnlockReleaseBuffer(buftodelete[i]);
+			}
+			END_CRIT_SECTION();
+		}
+
+		UnlockReleaseBuffer(buffer);
+
+		vacuum_delay_point();
+	}
+
+	list_free(rescanList);
+
 	return stats;
-}
+}
\ No newline at end of file
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 7fd91ce640..0a008a4a77 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,29 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	PageHeader		header;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+		header = (PageHeader) page;
+
+		header->pd_prune_xid = xldata->id;
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+		UnlockReleaseBuffer(buffer);
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,8 +135,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* Add new tuples if any */
@@ -324,6 +347,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +468,23 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index eb1c6728d4..db7ceaf56f 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,16 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 3b126eca2a..52ab3a9308 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,14 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +61,18 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId id;
+} gistxlogPageDelete;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf..5b92f08c74 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13..e66396e851 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.14.3 (Apple Git-98)

0002-Physical-GiST-scan.patchapplication/octet-stream; name=0002-Physical-GiST-scan.patch; x-unix-mode=0644Download
From c9b32e82173e53afbc5f36da37bdb8e5faea7865 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Thu, 28 Dec 2017 15:59:30 +0500
Subject: [PATCH 2/2] Physical GiST scan

---
 src/backend/access/gist/gistvacuum.c | 320 ++++++++++++++++++++++++++++++-----
 1 file changed, 278 insertions(+), 42 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index d191abe0fb..891e3144ca 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -103,8 +103,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 typedef struct GistBDItem
 {
-	GistNSN		parentlsn;
-	BlockNumber blkno;
+	GistNSN		 parentlsn;
+	BlockNumber  blkno;
+	OffsetNumber parentoffset;
 	struct GistBDItem *next;
 } GistBDItem;
 
@@ -128,31 +129,189 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	}
 }
 
-/*
- * Bulk deletion of all index entries pointing to a set of heap tuples and
- * check invalid tuples left after upgrade.
- * The set of target tuples is specified via a callback routine that tells
- * whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
- */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+
+#define GIST_PS_HAS_PARENT 1
+#define GIST_PS_EMPTY_LEAF 2
+
+
+/* Physiscal scan item */
+typedef struct GistPSItem
 {
-	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
-	BlockNumber recentParent = InvalidBlockNumber;
-	List	   *rescanList = NULL;
-	ListCell   *cell;
+	BlockNumber  parent;
+	List*        emptyLeafOffsets;
+	OffsetNumber parentOffset;
+	uint16_t     flags;
+} GistPSItem;
+
+typedef struct GistRescanItem
+{
+	BlockNumber       blkno;
+	List*             emptyLeafOffsets;
+	struct GistRescanItem* next;
+} GistRescanItem;
+
+static GistRescanItem*
+gistbulkdeletephysicalcan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state, BlockNumber npages)
+{
+	Relation	     rel = info->index;
+	GistRescanItem *result = NULL;
+	BlockNumber      blkno;
+
+    GistPSItem *graph = palloc0(npages * sizeof(GistPSItem));
+
+
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		Buffer		 buffer;
+		Page		 page;
+		OffsetNumber i,
+		             maxoff;
+		IndexTuple   idxtuple;
+		ItemId	     iid;
+
+		vacuum_delay_point();
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+									info->strategy);
+		/* 
+		 * We are not going to stay here for a long time, calling recursive algorithms.
+		 * Especially for an internal page. So, agressivly grab an exclusive lock.
+		 */
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		page = (Page) BufferGetPage(buffer);
+
+		if (PageIsNew(page) || GistPageIsDeleted(page))
+		{
+		    UnlockReleaseBuffer(buffer);			
+            /* Should not we record free page here? */
+			continue;
+		}
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		if (GistPageIsLeaf(page))
+		{
+			OffsetNumber todelete[MaxOffsetNumber];
+			int			ntodelete = 0;
+
+			/*
+			 * Remove deletable tuples from page
+			 */
+
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state))
+					todelete[ntodelete++] = i;
+				else
+					stats->num_index_tuples += 1;
+			}
+
+			stats->tuples_removed += ntodelete;
+
+			if (ntodelete)
+			{
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				PageIndexMultiDelete(page, todelete, ntodelete);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr	recptr;
+
+					recptr = gistXLogUpdate(buffer,
+											todelete, ntodelete,
+											NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				}
+				else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+
+				END_CRIT_SECTION();
+			}
+
+			if (ntodelete == maxoff)
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				if (graph[blkno].flags & GIST_PS_HAS_PARENT)
+				{
+					/* Go to parent and append myself */
+					BlockNumber parentblockno = graph[blkno].parent;
+					graph[parentblockno].emptyLeafOffsets = lappend_int(graph[parentblockno].emptyLeafOffsets, (int)graph[blkno].parentOffset);
+				}
+				else
+				{
+					/* Parent will collect me later */
+					graph[blkno].flags |= GIST_PS_EMPTY_LEAF;
+				}
+			}
+		}
+		else
+		{
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				BlockNumber childblkno;
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF)
+				{
+					/* Child has been scanned earlier and is ready to be picked up */
+					graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i);
+				}
+				else
+				{
+					/* Collect leaf when scan will come close */
+				    graph[childblkno].parent = blkno;
+				    graph[childblkno].parentOffset = i;
+				    graph[childblkno].flags |= GIST_PS_HAS_PARENT;
+				}
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
-	stats->estimated_count = false;
-	stats->num_index_tuples = 0;
 
+				if (GistTupleIsInvalid(idxtuple))
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+									RelationGetRelationName(rel)),
+							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+							 errhint("Please REINDEX it.")));
+			}
+		}
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* Search for internal pages pointing to empty leafs */
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		if (graph[blkno].emptyLeafOffsets)
+		{
+			GistRescanItem *next = palloc(sizeof(GistRescanItem));
+			next->blkno = blkno;
+			next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets;
+			next->next = result;
+			result = next;
+		}
+	}
+
+	pfree(graph);
+
+	return result;
+}
+
+static GistRescanItem*
+gistbulkdeletelogicalscan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation        rel = info->index;
+	BlockNumber     recentParent = InvalidBlockNumber;
+	GistBDItem     *stack,
+			       *ptr;
+	GistRescanItem *result = NULL;
+	
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
 
@@ -237,11 +396,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				END_CRIT_SECTION();
 			}
 
-			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
-				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber)
 			{
 				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
-				rescanList = lappend_int(rescanList, recentParent);
+				if (result == NULL || result->blkno != recentParent)
+				{
+			        GistRescanItem *next = palloc(sizeof(GistRescanItem));
+					next->blkno = recentParent;
+					next->emptyLeafOffsets = NULL;
+					next->next = result;
+					result = next;					
+				}
+				result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset);
 			}
 		}
 		else
@@ -261,6 +427,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 				ptr->parentlsn = PageGetLSN(page);
 				ptr->next = stack->next;
+				ptr->parentoffset = i;
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
@@ -281,20 +448,78 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		vacuum_delay_point();
 	}
 
-	/* rescan inner pages that had empty child pages */
-	foreach(cell,rescanList)
+	return result;
+}
+
+static int
+compare_offsetnumber(const void *x, const void *y)
+{
+	OffsetNumber a = *((OffsetNumber *)x);
+	OffsetNumber b = *((OffsetNumber *)y);
+	return a - b;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples and
+ * check invalid tuples left after upgrade.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation		rel = info->index;
+	GistRescanItem *rescan;
+	BlockNumber		npages;
+	bool			needLock;
+
+	/* first time through? */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	/* we'll re-count the tuples each time */
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	/* try to find deleted pages */
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/* If we have enough space to contruct map of whole graph, then we can do sequential reading of all index */
+	if(npages * (sizeof(GistPSItem)) > maintenance_work_mem * 1024)
 	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-		OffsetNumber todelete[MaxOffsetNumber];
-		Buffer		buftodelete[MaxOffsetNumber];
-		int			ntodelete = 0;
+		rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state);
+	}
+	else
+	{
+		rescan = gistbulkdeletephysicalcan(info, stats, callback, callback_state, npages);
+	}
 
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+	/* rescan inner pages that had empty child pages */
+	while(rescan)
+	{
+		Buffer			 buffer;
+		Page			 page;
+		OffsetNumber 	 i,
+						 maxoff;
+		IndexTuple		 idxtuple;
+		ItemId			 iid;
+		OffsetNumber 	 todelete[MaxOffsetNumber];
+		Buffer			 buftodelete[MaxOffsetNumber];
+		int				 ntodelete = 0;
+		ListCell  		*cell;
+		GistRescanItem	*oldRescan;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno,
 									RBM_NORMAL, info->strategy);
 		LockBuffer(buffer, GIST_EXCLUSIVE);
 		gistcheckpage(rel, buffer);
@@ -304,11 +529,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 
 		maxoff = PageGetMaxOffsetNumber(page);
 
-		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		/* Check that leafs are still empty and decide what to delete */
+		foreach(cell, rescan->emptyLeafOffsets)
 		{
 			Buffer		leafBuffer;
 			Page		leafPage;
 
+			i = (OffsetNumber)lfirst_int(cell);
+			if(i > maxoff)
+			{
+				continue;
+			}
+
 			iid = PageGetItemId(page, i);
 			idxtuple = (IndexTuple) PageGetItem(page, iid);
 
@@ -338,7 +570,8 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 			START_CRIT_SECTION();
 
 			MarkBufferDirty(buffer);
-				PageIndexMultiDelete(page, todelete, ntodelete);
+			qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber);
+			PageIndexMultiDelete(page, todelete, ntodelete);
 
 			if (RelationNeedsWAL(rel))
 			{
@@ -376,11 +609,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		}
 
 		UnlockReleaseBuffer(buffer);
+		oldRescan = rescan;
+		rescan = rescan->next;
+		list_free(oldRescan->emptyLeafOffsets);
+		pfree(oldRescan);
 
 		vacuum_delay_point();
 	}
 
-	list_free(rescanList);
 
 	return stats;
 }
\ No newline at end of file
-- 
2.14.3 (Apple Git-98)

#9Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Andrey Borodin (#8)
1 attachment(s)
Re: New gist vacuum.

Hi!

28 дек. 2017 г., в 16:37, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Here is new version of the patch for GiST VACUUM.
There are two main changes:
1. During rescan for page deletion only know to be recently empty pages are rescanned.
2. I've re-implemented physical scan with array instead of hash table.

There is one more minor spot in GiST VACUUM. It takes heap tuples count for statistics for partial indexes, while it should not.

If gistvacuumcleanup() is not given a statistics gathered by gistbulkdelete() it returns incorrect tuples count for partial index.
Here's the micropatch, which fixes that corner case.
To reproduce this effect I used this query:
create table y as select cube(random()) c from generate_series(1,10000) y; create index on y using gist(c) where c~>1 > 0.5;
vacuum verbose y;
Before patch it will report 10000 tuples, with patch it will report different values around 5000.

I do not know, should I register separate commitfest entry? The code is very close to main GiST VACUUM patch, but solves a bit different problem.

Best regards, Andrey Borodin.

Attachments:

0001-Count-tuples-correctly-during-GiST-VACUUM-of-partial.patchapplication/octet-stream; name=0001-Count-tuples-correctly-during-GiST-VACUUM-of-partial.patch; x-unix-mode=0644Download
From 6e67a65cc2e621424b5e6aa3e7b7524a02a54802 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sat, 30 Dec 2017 14:13:06 +0500
Subject: [PATCH] Count tuples correctly during GiST VACUUM of partial index

---
 src/backend/access/gist/gistvacuum.c | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 891e3144ca..1fc80ff42d 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/htup_details.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
@@ -34,7 +35,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber npages,
 				blkno;
 	BlockNumber totFreePages;
-	bool		needLock;
+	bool		needLock,
+				shouldcount = false;
+	uint64_t	tuples_count = 0;
 
 	/* No-op in ANALYZE ONLY mode */
 	if (info->analyze_only)
@@ -47,12 +50,13 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		/* use heap's tuple count */
 		stats->num_index_tuples = info->num_heap_tuples;
 		stats->estimated_count = info->estimated_count;
-
-		/*
-		 * XXX the above is wrong if index is partial.  Would it be OK to just
-		 * return NULL, or is there work we must do below?
-		 */
+		/* Unless it is estimate or index is partial  */
+		if (rel->rd_indextuple == NULL)
+			RelationInitIndexAccessInfo(rel);
+		shouldcount = !heap_attisnull(rel->rd_indextuple, Anum_pg_index_indpred) || info->estimated_count;
 	}
+	else
+		shouldcount = stats->estimated_count;
 
 	/*
 	 * Need lock unless it's local to this backend.
@@ -84,12 +88,25 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 			totFreePages++;
 			RecordFreeIndexPage(rel, blkno);
 		}
+		else
+		{
+			if (shouldcount && GistPageIsLeaf(page))
+			{
+				tuples_count += PageGetMaxOffsetNumber(page);
+			}
+		}
 		UnlockReleaseBuffer(buffer);
 	}
 
 	/* Finally, vacuum the FSM */
 	IndexFreeSpaceMapVacuum(info->index);
 
+	if (shouldcount)
+	{
+		stats->num_index_tuples = tuples_count;
+		stats->estimated_count = false;
+	}
+
 	/* return statistics */
 	stats->pages_free = totFreePages;
 	if (needLock)
-- 
2.14.3 (Apple Git-98)

#10Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andrey Borodin (#9)
1 attachment(s)
Re: New gist vacuum.

Hi!

On Sat, Dec 30, 2017 at 12:18 PM, Andrey Borodin <x4mmm@yandex-team.ru>
wrote:

28 дек. 2017 г., в 16:37, Andrey Borodin <x4mmm@yandex-team.ru>

написал(а):

Here is new version of the patch for GiST VACUUM.
There are two main changes:
1. During rescan for page deletion only know to be recently empty pages

are rescanned.

2. I've re-implemented physical scan with array instead of hash table.

There is one more minor spot in GiST VACUUM. It takes heap tuples count
for statistics for partial indexes, while it should not.

If gistvacuumcleanup() is not given a statistics gathered by
gistbulkdelete() it returns incorrect tuples count for partial index.
Here's the micropatch, which fixes that corner case.
To reproduce this effect I used this query:
create table y as select cube(random()) c from generate_series(1,10000) y;
create index on y using gist(c) where c~>1 > 0.5;
vacuum verbose y;
Before patch it will report 10000 tuples, with patch it will report
different values around 5000.

It's very good that you've fixed that.

I do not know, should I register separate commitfest entry? The code is

very close to main GiST VACUUM patch, but solves a bit different problem.

Yes, I think it deserves separate commitfest entry. Despite it's related
to GiST VACUUM, it's a separate fix.
I've made small improvements to this patch: variable naming, formatting,
comments.
BTW, do we really need to set shouldCount depending on whether we receive
stats argument or not? What if we always set shouldCount as in the first
branch of "if"?

shouldCount = !heap_attisnull(rel->rd_indextuple, Anum_pg_index_indpred) ||
info->estimated_count;

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

0001-Count-tuples-correctly-during-GiST-VACUUM-of-partial-2.patchapplication/octet-stream; name=0001-Count-tuples-correctly-during-GiST-VACUUM-of-partial-2.patchDownload
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 22181c6299..4868280ed7 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/htup_details.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
@@ -32,7 +33,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	BlockNumber npages,
 				blkno;
 	BlockNumber totFreePages;
-	bool		needLock;
+	bool		needLock,
+				shouldCount = false;
+	uint64_t	tuplesCount = 0;
 
 	/* No-op in ANALYZE ONLY mode */
 	if (info->analyze_only)
@@ -45,11 +48,15 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 		/* use heap's tuple count */
 		stats->num_index_tuples = info->num_heap_tuples;
 		stats->estimated_count = info->estimated_count;
-
-		/*
-		 * XXX the above is wrong if index is partial.  Would it be OK to just
-		 * return NULL, or is there work we must do below?
-		 */
+		/* unless it is estimate or index is partial  */
+		if (rel->rd_indextuple == NULL)
+			RelationInitIndexAccessInfo(rel);
+		shouldCount = !heap_attisnull(rel->rd_indextuple, Anum_pg_index_indpred) ||
+					   info->estimated_count;
+	}
+	else
+	{
+		shouldCount = stats->estimated_count;
 	}
 
 	/*
@@ -82,12 +89,27 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 			totFreePages++;
 			RecordFreeIndexPage(rel, blkno);
 		}
+		else
+		{
+			/* Count tuples in leaf pages if needed */
+			if (shouldCount && GistPageIsLeaf(page))
+			{
+				tuplesCount += PageGetMaxOffsetNumber(page);
+			}
+		}
 		UnlockReleaseBuffer(buffer);
 	}
 
 	/* Finally, vacuum the FSM */
 	IndexFreeSpaceMapVacuum(info->index);
 
+	/* Update index tuples stat to counted over leaf pages if needed */
+	if (shouldCount)
+	{
+		stats->num_index_tuples = tuplesCount;
+		stats->estimated_count = false;
+	}
+
 	/* return statistics */
 	stats->pages_free = totFreePages;
 	if (needLock)
#11Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Andrey Borodin (#8)
Re: New gist vacuum.

Hi!

On Thu, Dec 28, 2017 at 2:37 PM, Andrey Borodin <x4mmm@yandex-team.ru>
wrote:

19 дек. 2017 г., в 15:58, Andrey Borodin <x4mmm@yandex-team.ru>

написал(а):

Here is the patch that deletes pages during GiST VACUUM.

Here is new version of the patch for GiST VACUUM.
There are two main changes:
1. During rescan for page deletion only know to be recently empty pages
are rescanned.
2. I've re-implemented physical scan with array instead of hash table.

I'm very glad this patch isn't forgotten. I've assigned to review this
patch.
My first note on this patchset is following. These patches touches
sensitive aspects or GiST and are related to complex concurrency issues.
Thus, I'm sure both of them deserve high-level description in
src/backend/access/gist/README. Given this description, it would be much
easier to review the patches.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#12Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Alexander Korotkov (#10)
Re: New gist vacuum.

Hi, Alexander!

Many thanks for looking into patches!
A little bit later I will provide answer in other branch of discussion.

15 янв. 2018 г., в 23:34, Alexander Korotkov <a.korotkov@postgrespro.ru> написал(а):

I do not know, should I register separate commitfest entry? The code is very close to main GiST VACUUM patch, but solves a bit different problem.

Yes, I think it deserves separate commitfest entry. Despite it's related to GiST VACUUM, it's a separate fix.

Ok, done. https://commitfest.postgresql.org/17/1483

I've made small improvements to this patch: variable naming, formatting, comments.

Great, thanks!

BTW, do we really need to set shouldCount depending on whether we receive stats argument or not? What if we always set shouldCount as in the first branch of "if"?

shouldCount = !heap_attisnull(rel->rd_indextuple, Anum_pg_index_indpred) ||
info->estimated_count;

We do not need to count if we have exact count from heap and this index is not partial. ITSM that it is quite common case.

Best regards, Andrey Borodin.

#13Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Alexander Korotkov (#11)
1 attachment(s)
Re: New gist vacuum.

Hi!

15 янв. 2018 г., в 23:42, Alexander Korotkov <a.korotkov@postgrespro.ru> написал(а):

I'm very glad this patch isn't forgotten. I've assigned to review this patch.

Cool, thanks!

My first note on this patchset is following. These patches touches sensitive aspects or GiST and are related to complex concurrency issues.

Indeed! That is why I've divided patches: first one implements very simple scan algorithm with concurrency and recovery, second replaces simple algorithm with two more tricky algorithms.

Thus, I'm sure both of them deserve high-level description in src/backend/access/gist/README. Given this description, it would be much easier to review the patches.

Yes, that's definitely true. Please find README patch attached.

Best regards, Andrey Borodin.

Attachments:

0003-Update-README-with-info-on-new-GiST-VACUUM.patchapplication/octet-stream; name=0003-Update-README-with-info-on-new-GiST-VACUUM.patch; x-unix-mode=0644Download
From d7652b6664ac1c965ba86d374a94fdeb7691fbb1 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Tue, 16 Jan 2018 21:37:51 +0500
Subject: [PATCH 3/3] Update README with info on new GiST VACUUM

---
 src/backend/access/gist/README | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b8..6b2b12ee96 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used it allocate newly split pages. To find this pages
+function can choose between two strategies: logical scan or physical scan.
+
+Physical scan reads the entire index from the first page to last. This scan
+maintains graph structure in palloc'ed array to collect block numbers of
+internal pages that need cleansing from references to empty leafs. Also, the
+array contains offsets on the internal page to potentially free leaf page. This
+scan method is chosen when maintenance work memory is sufficient to hold
+necessary graph structure.
+
+The logical scan is chosen when there is not enough maintenance memory to
+execute the physical scan. Logical scan traverses GiST index in BFS, looking up
+into incomplete split branches. The logical scan can be slower on hard disk
+drives.
+
+The result of both scans are the same: the stack of block numbers of internal
+pages with the list of offsets potentially referencing empty leaf pages. After
+the scan, for each internal pages under exclusive lock, each potentially free
+leaf page is examined. gistbulkdelete() never delete last one reference from
+internal page to preserve balanced tree properties.
+
+The physical scan can return empty leaf pages offsets unordered. Thus, before
+executing PageIndexMultiDelete offsets (already locked and checked) are sorted.
+This step is not necessary for the logical scan.
+
+Both scans hold only one lock at a time. Physical scan grabs exclusive lock
+instantly, while logical scan takes shared lock and then swaps it to exclusive.
+This is done because amount of work on internal page done by physical scan is
+lower and amount of internal pages is relatively low compared to the amount of
+leaf pages.
+
 
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
-- 
2.14.3 (Apple Git-98)

#14Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Andrey Borodin (#13)
3 attachment(s)
Re: New gist vacuum.

Hello, Alexander!

16 янв. 2018 г., в 21:42, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Please find README patch attached.

Here's v2 version. Same code, but x2 comments. Also fixed important typo in readme BFS->DFS. Feel free to ping me any time with questions.

Best regards, Andrey Borodin.

Attachments:

0002-Physical-GiSC-scan-during-VACUUM-v2.patchapplication/octet-stream; name=0002-Physical-GiSC-scan-during-VACUUM-v2.patch; x-unix-mode=0644Download
From 8f02384366186809e25c4fab56e79990ac9c1c5d Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 21 Jan 2018 15:06:39 +0500
Subject: [PATCH 2/3] Physical GiSC scan during VACUUM v2

---
 src/backend/access/gist/gistvacuum.c | 337 ++++++++++++++++++++++++++++++-----
 1 file changed, 297 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 235c51c71b..6ca740f5ae 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -103,8 +103,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 
 typedef struct GistBDItem
 {
-	GistNSN		parentlsn;
-	BlockNumber blkno;
+	GistNSN		 parentlsn;
+	BlockNumber  blkno;
+	OffsetNumber parentoffset;
 	struct GistBDItem *next;
 } GistBDItem;
 
@@ -129,30 +130,204 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 }
 
 /*
- * Bulk deletion of all index entries pointing to a set of heap tuples and
- * check invalid tuples left after upgrade.
- * The set of target tuples is specified via a callback routine that tells
- * whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ * During physical scan for every pair parent-child we can either find parent
+ * first or child first. Every time we open internal page - we mark parent
+ * block no for every child and set GIST_PS_HAS_PARENT. When scan will get to
+ * child page, if this page turns out to be empty - we will get back by
+ * parent link. If we find child first (still without parent link), we mark
+ * the page as GIST_PS_EMPTY_LEAF if it is ready to be deleted. When we will
+ * scan it's parent - we will pick it to rescan list.
  */
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+#define GIST_PS_HAS_PARENT 1
+#define GIST_PS_EMPTY_LEAF 2
+
+
+/* Physiscal scan item */
+typedef struct GistPSItem
 {
-	Relation	rel = info->index;
-	GistBDItem *stack,
-			   *ptr;
-	BlockNumber recentParent = InvalidBlockNumber;
-	List	   *rescanList = NULL;
-	ListCell   *cell;
+	BlockNumber  parent;
+	List*        emptyLeafOffsets;
+	OffsetNumber parentOffset;
+	uint16_t     flags;
+} GistPSItem;
+
+/* Blocknumber of internal pages with offsets to rescan for deletion */
+typedef struct GistRescanItem
+{
+	BlockNumber       blkno;
+	List*             emptyLeafOffsets;
+	struct GistRescanItem* next;
+} GistRescanItem;
+
+/* Read all pages sequentially populating array of GistPSItem */
+static GistRescanItem*
+gistbulkdeletephysicalcan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state, BlockNumber npages)
+{
+	Relation	     rel = info->index;
+	GistRescanItem *result = NULL;
+	BlockNumber      blkno;
 
-	/* first time through? */
-	if (stats == NULL)
-		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-	/* we'll re-count the tuples each time */
-	stats->estimated_count = false;
-	stats->num_index_tuples = 0;
+	/* Here we will store whole graph of the index */
+	GistPSItem *graph = palloc0(npages * sizeof(GistPSItem));
+
+
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		Buffer		 buffer;
+		Page		 page;
+		OffsetNumber i,
+					 maxoff;
+		IndexTuple   idxtuple;
+		ItemId	     iid;
+
+		vacuum_delay_point();
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+									info->strategy);
+		/*
+		 * We are not going to stay here for a long time, calling recursive algorithms.
+		 * Especially for an internal page. So, agressivly grab an exclusive lock.
+		 */
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		page = (Page) BufferGetPage(buffer);
+
+		if (PageIsNew(page) || GistPageIsDeleted(page))
+		{
+			UnlockReleaseBuffer(buffer);
+			/* TODO: Should not we record free page here? */
+			continue;
+		}
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		if (GistPageIsLeaf(page))
+		{
+			OffsetNumber todelete[MaxOffsetNumber];
+			int			ntodelete = 0;
+
+			/*
+			 * Remove deletable tuples from page
+			 */
 
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+				if (callback(&(idxtuple->t_tid), callback_state))
+					todelete[ntodelete++] = i;
+				else
+					stats->num_index_tuples += 1;
+			}
+
+			stats->tuples_removed += ntodelete;
+
+			/* We have dead tuples on the page */
+			if (ntodelete)
+			{
+				START_CRIT_SECTION();
+
+				MarkBufferDirty(buffer);
+
+				PageIndexMultiDelete(page, todelete, ntodelete);
+				GistMarkTuplesDeleted(page);
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr	recptr;
+
+					recptr = gistXLogUpdate(buffer,
+											todelete, ntodelete,
+											NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+				}
+				else
+					PageSetLSN(page, gistGetFakeLSN(rel));
+
+				END_CRIT_SECTION();
+			}
+
+			/* The page is completely empty */
+			if (ntodelete == maxoff)
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				if (graph[blkno].flags & GIST_PS_HAS_PARENT)
+				{
+					/* Go to parent and append myself */
+					BlockNumber parentblockno = graph[blkno].parent;
+					graph[parentblockno].emptyLeafOffsets = lappend_int(graph[parentblockno].emptyLeafOffsets, (int)graph[blkno].parentOffset);
+				}
+				else
+				{
+					/* Parent will collect me later */
+					graph[blkno].flags |= GIST_PS_EMPTY_LEAF;
+				}
+			}
+		}
+		else
+		{
+			/* For internal pages we remember stucture of the tree */
+			for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+			{
+				BlockNumber childblkno;
+				iid = PageGetItemId(page, i);
+				idxtuple = (IndexTuple) PageGetItem(page, iid);
+				childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+
+				if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF)
+				{
+					/* Child has been scanned earlier and is ready to be picked up */
+					graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i);
+				}
+				else
+				{
+					/* Collect leaf when scan will come close */
+					graph[childblkno].parent = blkno;
+					graph[childblkno].parentOffset = i;
+					graph[childblkno].flags |= GIST_PS_HAS_PARENT;
+				}
+
+
+				if (GistTupleIsInvalid(idxtuple))
+					ereport(LOG,
+							(errmsg("index \"%s\" contains an inner tuple marked as invalid",
+									RelationGetRelationName(rel)),
+							 errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
+							 errhint("Please REINDEX it.")));
+			}
+		}
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* Search for internal pages pointing to empty leafs */
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		if (graph[blkno].emptyLeafOffsets)
+		{
+			GistRescanItem *next = palloc(sizeof(GistRescanItem));
+			next->blkno = blkno;
+			next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets;
+			next->next = result;
+			result = next;
+		}
+	}
+
+	pfree(graph);
+
+	return result;
+}
+
+/* Logical scan descends from root to leafs in DFS search */
+static GistRescanItem*
+gistbulkdeletelogicalscan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation        rel = info->index;
+	BlockNumber     recentParent = InvalidBlockNumber;
+	GistBDItem     *stack,
+				   *ptr;
+	GistRescanItem *result = NULL;
+
+	/* This stack is used to organize DFS */
 	stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
 	stack->blkno = GIST_ROOT_BLKNO;
 
@@ -237,11 +412,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				END_CRIT_SECTION();
 			}
 
-			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
-				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber)
 			{
 				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
-				rescanList = lappend_int(rescanList, recentParent);
+				if (result == NULL || result->blkno != recentParent)
+				{
+					GistRescanItem *next = palloc(sizeof(GistRescanItem));
+					next->blkno = recentParent;
+					next->emptyLeafOffsets = NULL;
+					next->next = result;
+					result = next;
+				}
+				result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset);
 			}
 		}
 		else
@@ -261,6 +443,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 				ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
 				ptr->parentlsn = BufferGetLSNAtomic(buffer);
 				ptr->next = stack->next;
+				ptr->parentoffset = i;
 				stack->next = ptr;
 
 				if (GistTupleIsInvalid(idxtuple))
@@ -281,20 +464,82 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		vacuum_delay_point();
 	}
 
-	/* rescan inner pages that had empty child pages */
-	foreach(cell,rescanList)
+	return result;
+}
+
+/*
+ * This function is used to sort offsets for PageIndexMultiDelete
+ * When employing physical scan rescan offsets are not ordered.
+ */
+static int
+compare_offsetnumber(const void *x, const void *y)
+{
+	OffsetNumber a = *((OffsetNumber *)x);
+	OffsetNumber b = *((OffsetNumber *)y);
+	return a - b;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples and
+ * check invalid tuples left after upgrade.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
+{
+	Relation		rel = info->index;
+	GistRescanItem *rescan;
+	BlockNumber		npages;
+	bool			needLock;
+
+	/* first time through? */
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+	/* we'll re-count the tuples each time */
+	stats->estimated_count = false;
+	stats->num_index_tuples = 0;
+
+	/*
+	 * Need lock unless it's local to this backend.
+	 */
+	needLock = !RELATION_IS_LOCAL(rel);
+
+	/* try to find deleted pages */
+	if (needLock)
+		LockRelationForExtension(rel, ExclusiveLock);
+	npages = RelationGetNumberOfBlocks(rel);
+	if (needLock)
+		UnlockRelationForExtension(rel, ExclusiveLock);
+
+	/* If we have enough space to contruct map of whole graph, then we can do sequential reading of all index */
+	if (npages * (sizeof(GistPSItem)) > maintenance_work_mem * 1024)
 	{
-		Buffer		buffer;
-		Page		page;
-		OffsetNumber i,
-					maxoff;
-		IndexTuple	idxtuple;
-		ItemId		iid;
-		OffsetNumber todelete[MaxOffsetNumber];
-		Buffer		buftodelete[MaxOffsetNumber];
-		int			ntodelete = 0;
+		rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state);
+	}
+	else
+	{
+		rescan = gistbulkdeletephysicalcan(info, stats, callback, callback_state, npages);
+	}
 
-		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+	/* rescan inner pages that had empty child pages */
+	while (rescan)
+	{
+		Buffer			 buffer;
+		Page			 page;
+		OffsetNumber 	 i,
+						 maxoff;
+		IndexTuple		 idxtuple;
+		ItemId			 iid;
+		OffsetNumber 	 todelete[MaxOffsetNumber];
+		Buffer			 buftodelete[MaxOffsetNumber];
+		int				 ntodelete = 0;
+		ListCell  		*cell;
+		GistRescanItem	*oldRescan;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno,
 									RBM_NORMAL, info->strategy);
 		LockBuffer(buffer, GIST_EXCLUSIVE);
 		gistcheckpage(rel, buffer);
@@ -304,11 +549,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 
 		maxoff = PageGetMaxOffsetNumber(page);
 
-		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		/* Check that leafs are still empty and decide what to delete */
+		foreach(cell, rescan->emptyLeafOffsets)
 		{
 			Buffer		leafBuffer;
 			Page		leafPage;
 
+			i = (OffsetNumber)lfirst_int(cell);
+			if(i > maxoff)
+			{
+				continue;
+			}
+
 			iid = PageGetItemId(page, i);
 			idxtuple = (IndexTuple) PageGetItem(page, iid);
 
@@ -338,7 +590,9 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 			START_CRIT_SECTION();
 
 			MarkBufferDirty(buffer);
-				PageIndexMultiDelete(page, todelete, ntodelete);
+			/* Prepare possibly onurdered offsets */
+			qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber);
+			PageIndexMultiDelete(page, todelete, ntodelete);
 
 			if (RelationNeedsWAL(rel))
 			{
@@ -376,11 +630,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD
 		}
 
 		UnlockReleaseBuffer(buffer);
+		oldRescan = rescan;
+		rescan = rescan->next;
+		list_free(oldRescan->emptyLeafOffsets);
+		pfree(oldRescan);
 
 		vacuum_delay_point();
 	}
 
-	list_free(rescanList);
 
 	return stats;
 }
\ No newline at end of file
-- 
2.14.3 (Apple Git-98)

0003-Update-README-with-info-on-new-GiST-VACUUM-v2.patchapplication/octet-stream; name=0003-Update-README-with-info-on-new-GiST-VACUUM-v2.patch; x-unix-mode=0644Download
From 9f4aeae9a330081b903456ca6d1fcac8cc54a732 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 21 Jan 2018 15:07:55 +0500
Subject: [PATCH 3/3] Update README with info on new GiST VACUUM v2

---
 src/backend/access/gist/README | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b8..9548872be8 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used it allocate newly split pages. To find this pages
+function can choose between two strategies: logical scan or physical scan.
+
+Physical scan reads the entire index from the first page to last. This scan
+maintains graph structure in palloc'ed array to collect block numbers of
+internal pages that need cleansing from references to empty leafs. Also, the
+array contains offsets on the internal page to potentially free leaf page. This
+scan method is chosen when maintenance work memory is sufficient to hold
+necessary graph structure.
+
+The logical scan is chosen when there is not enough maintenance memory to
+execute the physical scan. Logical scan traverses GiST index in DFS, looking up
+into incomplete split branches. The logical scan can be slower on hard disk
+drives.
+
+The result of both scans are the same: the stack of block numbers of internal
+pages with the list of offsets potentially referencing empty leaf pages. After
+the scan, for each internal pages under exclusive lock, each potentially free
+leaf page is examined. gistbulkdelete() never delete last one reference from
+internal page to preserve balanced tree properties.
+
+The physical scan can return empty leaf pages offsets unordered. Thus, before
+executing PageIndexMultiDelete offsets (already locked and checked) are sorted.
+This step is not necessary for the logical scan.
+
+Both scans hold only one lock at a time. Physical scan grabs exclusive lock
+instantly, while logical scan takes shared lock and then swaps it to exclusive.
+This is done because amount of work on internal page done by physical scan is
+lower and amount of internal pages is relatively low compared to the amount of
+leaf pages.
+
 
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
-- 
2.14.3 (Apple Git-98)

0001-Delete-pages-during-GiST-VACUUM-v2.patchapplication/octet-stream; name=0001-Delete-pages-during-GiST-VACUUM-v2.patch; x-unix-mode=0644Download
From 9cc77f50747db58014dae482fe7a9d6b306b0c0b Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sun, 21 Jan 2018 14:43:50 +0500
Subject: [PATCH 1/3] Delete pages during GiST VACUUM v2

---
 src/backend/access/gist/gist.c       |   7 +++
 src/backend/access/gist/gistbuild.c  |   5 --
 src/backend/access/gist/gistutil.c   |   4 +-
 src/backend/access/gist/gistvacuum.c | 119 +++++++++++++++++++++++++++++++++--
 src/backend/access/gist/gistxlog.c   |  47 +++++++++++++-
 src/include/access/gist_private.h    |  23 ++++---
 src/include/access/gistxlog.h        |  16 ++++-
 src/test/regress/expected/gist.out   |   4 +-
 src/test/regress/sql/gist.sql        |   4 +-
 9 files changed, 203 insertions(+), 26 deletions(-)

diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 51c32e4afe..f7973a2e15 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -694,6 +694,13 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index d22318a5f1..abb9a642c6 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 55cccd247a..52c1b6fa69 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -24,6 +24,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -801,13 +802,14 @@ gistNewBuffer(Relation r)
 		if (ConditionalLockBuffer(buffer))
 		{
 			Page		page = BufferGetPage(buffer);
+			PageHeader	p = (PageHeader) page;
 
 			if (PageIsNew(page))
 				return buffer;	/* OK to use, if never initialized */
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(p->pd_prune_xid, RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 22181c6299..235c51c71b 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -20,6 +20,8 @@
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "utils/snapmgr.h"
+#include "access/xact.h"
 
 
 /*
@@ -126,7 +128,6 @@ pushStackIfSplited(Page page, GistBDItem *stack)
 	}
 }
 
-
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -136,12 +137,14 @@ pushStackIfSplited(Page page, GistBDItem *stack)
  * Result: a palloc'd struct containing statistical info for VACUUM displays.
  */
 IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
-			   IndexBulkDeleteCallback callback, void *callback_state)
+gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state)
 {
 	Relation	rel = info->index;
 	GistBDItem *stack,
 			   *ptr;
+	BlockNumber recentParent = InvalidBlockNumber;
+	List	   *rescanList = NULL;
+	ListCell   *cell;
 
 	/* first time through? */
 	if (stats == NULL)
@@ -234,9 +237,16 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 				END_CRIT_SECTION();
 			}
 
+			if (ntodelete == maxoff && recentParent!=InvalidBlockNumber &&
+				(rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent))
+			{
+				/* This page is a candidate to be deleted. Remember it's parent to rescan it later with xlock */
+				rescanList = lappend_int(rescanList, recentParent);
+			}
 		}
 		else
 		{
+			recentParent = stack->blkno;
 			/* check for split proceeded after look at parent */
 			pushStackIfSplited(page, stack);
 
@@ -271,5 +281,106 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		vacuum_delay_point();
 	}
 
+	/* rescan inner pages that had empty child pages */
+	foreach(cell,rescanList)
+	{
+		Buffer		buffer;
+		Page		page;
+		OffsetNumber i,
+					maxoff;
+		IndexTuple	idxtuple;
+		ItemId		iid;
+		OffsetNumber todelete[MaxOffsetNumber];
+		Buffer		buftodelete[MaxOffsetNumber];
+		int			ntodelete = 0;
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell),
+									RBM_NORMAL, info->strategy);
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		gistcheckpage(rel, buffer);
+		page = (Page) BufferGetPage(buffer);
+
+		Assert(!GistPageIsLeaf(page));
+
+		maxoff = PageGetMaxOffsetNumber(page);
+
+		for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i))
+		{
+			Buffer		leafBuffer;
+			Page		leafPage;
+
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
+								RBM_NORMAL, info->strategy);
+			LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+			gistcheckpage(rel, leafBuffer);
+			leafPage = (Page) BufferGetPage(leafBuffer);
+			Assert(GistPageIsLeaf(leafPage));
+
+			if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+				&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+				&& ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+			{
+				buftodelete[ntodelete] = leafBuffer;
+				todelete[ntodelete++] = i;
+			}
+			else
+				UnlockReleaseBuffer(leafBuffer);
+		}
+
+
+		if (ntodelete)
+		{
+			/* Drop references from internal page */
+			TransactionId txid = GetCurrentTransactionId();
+			START_CRIT_SECTION();
+
+			MarkBufferDirty(buffer);
+				PageIndexMultiDelete(page, todelete, ntodelete);
+
+			if (RelationNeedsWAL(rel))
+			{
+				XLogRecPtr	recptr;
+
+				recptr = gistXLogUpdate(buffer, todelete, ntodelete, NULL, 0, InvalidBuffer);
+					PageSetLSN(page, recptr);
+			}
+			else
+				PageSetLSN(page, gistGetFakeLSN(rel));
+
+			/* Mark pages as deleted */
+			for (i = 0; i < ntodelete; i++)
+			{
+				Page		leafPage = (Page)BufferGetPage(buftodelete[i]);
+				PageHeader	header = (PageHeader)leafPage;
+
+				header->pd_prune_xid = txid;
+
+				GistPageSetDeleted(leafPage);
+				MarkBufferDirty(buftodelete[i]);
+				stats->pages_deleted++;
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i], header->pd_prune_xid);
+					PageSetLSN(leafPage, recptr);
+				}
+				else
+					PageSetLSN(leafPage, gistGetFakeLSN(rel));
+
+				UnlockReleaseBuffer(buftodelete[i]);
+			}
+			END_CRIT_SECTION();
+		}
+
+		UnlockReleaseBuffer(buffer);
+
+		vacuum_delay_point();
+	}
+
+	list_free(rescanList);
+
 	return stats;
-}
+}
\ No newline at end of file
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e09126978..7d515f3e7e 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,29 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+	PageHeader		header;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+		header = (PageHeader) page;
+
+		header->pd_prune_xid = xldata->id;
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+
+		UnlockReleaseBuffer(buffer);
+	}
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,8 +135,8 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-			if (GistPageIsLeaf(page))
-				GistMarkTuplesDeleted(page);
+
+			GistMarkTuplesDeleted(page);
 		}
 
 		/* Add new tuples if any */
@@ -324,6 +347,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +468,23 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.id = xid;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba..fe4b87084e 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,16 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0..8df7f4064d 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,14 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +61,18 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId id;
+} gistxlogPageDelete;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf..5b92f08c74 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13..e66396e851 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.14.3 (Apple Git-98)

#15David Steele
david@pgmasters.net
In reply to: Andrey Borodin (#14)
Re: Re: New gist vacuum.

Hi Andrey,

On 1/21/18 5:34 AM, Andrey Borodin wrote:

Hello, Alexander!

16 янв. 2018 г., в 21:42, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Please find README patch attached.

Here's v2 version. Same code, but x2 comments. Also fixed important typo in readme BFS->DFS. Feel free to ping me any time with questions.

This patch has not gotten review and does not seem like a good fit for
the last PG11 CF so I am marking it Returned with Feedback.

Regards,
--
-David
david@pgmasters.net

#16Andrey Borodin
x4mmm@yandex-team.ru
In reply to: David Steele (#15)
Re: New gist vacuum.

Hi, David!

7 февр. 2018 г., в 18:39, David Steele <david@pgmasters.net> написал(а):

Hi Andrey,

On 1/21/18 5:34 AM, Andrey Borodin wrote:

Hello, Alexander!

16 янв. 2018 г., в 21:42, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Please find README patch attached.

Here's v2 version. Same code, but x2 comments. Also fixed important typo in readme BFS->DFS. Feel free to ping me any time with questions.

This patch has not gotten review and does not seem like a good fit for
the last PG11 CF so I am marking it Returned with Feedback.

Why do you think this patch does not seem good fit for CF?

I've been talking with Alexander just yesterday at PgConf.Russia, and he was going to provide review.

Best regards, Andrey Borodin.

#17David Steele
david@pgmasters.net
In reply to: Andrey Borodin (#16)
Re: New gist vacuum.

Hi Andrey,

On 2/7/18 10:46 AM, Andrey Borodin wrote:

7 февр. 2018 г., в 18:39, David Steele <david@pgmasters.net> написал(а):

Hi Andrey,

On 1/21/18 5:34 AM, Andrey Borodin wrote:

Hello, Alexander!

16 янв. 2018 г., в 21:42, Andrey Borodin <x4mmm@yandex-team.ru> написал(а):
Please find README patch attached.

Here's v2 version. Same code, but x2 comments. Also fixed important typo in readme BFS->DFS. Feel free to ping me any time with questions.

This patch has not gotten review and does not seem like a good fit for
the last PG11 CF so I am marking it Returned with Feedback.

Why do you think this patch does not seem good fit for CF?

Apologies for the brevity. I had about 40 patches to go through yesterday.

The reason it does not seem a good fit is that it's a new, possibly
invasive patch that has not gotten any review in the last three CFs
since it was reintroduced. I'm not sure why that's the case and I have
no opinion about the patch itself, but there it is.

We try to avoid new patches in the last CF that could be destabilizing
and this patch appears to be in that category. I know it has been
around for a while, but the lack of review makes it "new" in the context
of the last CF for PG11.

I've been talking with Alexander just yesterday at PgConf.Russia, and he was going to provide review.

Great! I'd suggest you submit this patch for the CF after 2018-03.

However, that's completely up to you.

Regards,
--
-David
david@pgmasters.net

#18Michail Nikolaev
michail.nikolaev@gmail.com
In reply to: Alexander Korotkov (#10)
Re: New gist vacuum.

The following review has been posted through the commitfest application:
make installcheck-world: tested, passed
Implements feature: tested, passed
Spec compliant: not tested
Documentation: not tested

Hello.

I have added small change to patch to allow it be compiled using msvc (uint64_t -> uint64).
Everything seems to work, check-world is passing.

Actually patch fixes two issues:
1) Partial GIST indexes now have corrent tuples count estimation.
2) Now subsequent calls to VACUUM on GIST index (like "VACCUM table_name") do not change tuples count to estimated number of tuples in table (which is changed even without any updates in table due current implementation).

I think it is fine to commit.

Patch is also availble on github:
https://github.com/michail-nikolaev/postgres/commit/ff5171b586e4eb60ea5d15a18055d8ea4e44d244?ts=4

I'll attach patch file next message.

The new status of this patch is: Ready for Committer

#19Michail Nikolaev
michail.nikolaev@gmail.com
In reply to: Michail Nikolaev (#18)
1 attachment(s)
Re: New gist vacuum.

I'll attach patch file next message.

Updated patch is attached.

Attachments:

gist-vacuum-count.patchapplication/octet-stream; name=gist-vacuum-count.patchDownload
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 22181c6..21ddc18 100644
*** a/src/backend/access/gist/gistvacuum.c
--- b/src/backend/access/gist/gistvacuum.c
***************
*** 16,21 ****
--- 16,22 ----
  
  #include "access/genam.h"
  #include "access/gist_private.h"
+ #include "access/htup_details.h"
  #include "commands/vacuum.h"
  #include "miscadmin.h"
  #include "storage/indexfsm.h"
***************
*** 32,38 **** gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  	BlockNumber npages,
  				blkno;
  	BlockNumber totFreePages;
! 	bool		needLock;
  
  	/* No-op in ANALYZE ONLY mode */
  	if (info->analyze_only)
--- 33,41 ----
  	BlockNumber npages,
  				blkno;
  	BlockNumber totFreePages;
! 	bool		needLock,
! 				shouldCount = false;
! 	uint64		tuplesCount = 0;
  
  	/* No-op in ANALYZE ONLY mode */
  	if (info->analyze_only)
***************
*** 45,55 **** gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
  		/* use heap's tuple count */
  		stats->num_index_tuples = info->num_heap_tuples;
  		stats->estimated_count = info->estimated_count;
! 
! 		/*
! 		 * XXX the above is wrong if index is partial.  Would it be OK to just
! 		 * return NULL, or is there work we must do below?
! 		 */
  	}
  
  	/*
--- 48,62 ----
  		/* use heap's tuple count */
  		stats->num_index_tuples = info->num_heap_tuples;
  		stats->estimated_count = info->estimated_count;
! 		/* unless it is estimate or index is partial  */
! 		if (rel->rd_indextuple == NULL)
! 			RelationInitIndexAccessInfo(rel);
! 		shouldCount = !heap_attisnull(rel->rd_indextuple, Anum_pg_index_indpred) ||
! 					   info->estimated_count;
! 	}
! 	else
! 	{
! 		shouldCount = stats->estimated_count;
  	}
  
  	/*
***************
*** 82,93 **** gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
--- 89,115 ----
  			totFreePages++;
  			RecordFreeIndexPage(rel, blkno);
  		}
+ 		else
+ 		{
+ 			/* Count tuples in leaf pages if needed */
+ 			if (shouldCount && GistPageIsLeaf(page))
+ 			{
+ 				tuplesCount += PageGetMaxOffsetNumber(page);
+ 			}
+ 		}
  		UnlockReleaseBuffer(buffer);
  	}
  
  	/* Finally, vacuum the FSM */
  	IndexFreeSpaceMapVacuum(info->index);
  
+ 	/* Update index tuples stat to counted over leaf pages if needed */
+ 	if (shouldCount)
+ 	{
+ 		stats->num_index_tuples = tuplesCount;
+ 		stats->estimated_count = false;
+ 	}
+ 
  	/* return statistics */
  	stats->pages_free = totFreePages;
  	if (needLock)
#20Tom Lane
tgl@sss.pgh.pa.us
In reply to: Michail Nikolaev (#18)
Re: New gist vacuum.

Michail Nikolaev <michail.nikolaev@gmail.com> writes:

I have added small change to patch to allow it be compiled using msvc (uint64_t -> uint64).
Everything seems to work, check-world is passing.

Actually patch fixes two issues:
1) Partial GIST indexes now have corrent tuples count estimation.
2) Now subsequent calls to VACUUM on GIST index (like "VACCUM table_name") do not change tuples count to estimated number of tuples in table (which is changed even without any updates in table due current implementation).

I think it is fine to commit.

I took a quick look at this. I wonder what is the point of making
the counting conditional. Since the function is visiting every
index page anyway, why not just always count and unconditionally
provide an exact answer? The number of cycles saved by skipping
"tuplesCount += PageGetMaxOffsetNumber(page)" on each leaf page
is surely trivial.

regards, tom lane

#21Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Tom Lane (#20)
1 attachment(s)
Re: New gist vacuum.

1 марта 2018 г., в 22:44, Tom Lane <tgl@sss.pgh.pa.us> написал(а):

Michail Nikolaev <michail.nikolaev@gmail.com> writes:

I have added small change to patch to allow it be compiled using msvc (uint64_t -> uint64).
Everything seems to work, check-world is passing.

Actually patch fixes two issues:
1) Partial GIST indexes now have corrent tuples count estimation.
2) Now subsequent calls to VACUUM on GIST index (like "VACCUM table_name") do not change tuples count to estimated number of tuples in table (which is changed even without any updates in table due current implementation).

I think it is fine to commit.

I took a quick look at this. I wonder what is the point of making
the counting conditional. Since the function is visiting every
index page anyway, why not just always count and unconditionally
provide an exact answer? The number of cycles saved by skipping
"tuplesCount += PageGetMaxOffsetNumber(page)" on each leaf page
is surely trivial.

Thanks for looking into the patch, Tom!

I thought that it's a good idea to optimize out as many cycles as possible.
But, indeed, there are some reasons in favor of unconditional counting:
1. Code is cleaner, and this is not hot path
2. If we choose unconditional counting in gistvacuumcleanup() I'll remove those counting cycles in gistbulkdelete() in main vacuum patch (for v12). Both functions will have less code.

So, I agree, unconditional counting is a good idea. Here's the v3 patch.

Best regards, Andrey Borodin.

Attachments:

0001-Fix-GiST-stats-for-partial-indexes-v3.patchapplication/octet-stream; name=0001-Fix-GiST-stats-for-partial-indexes-v3.patch; x-unix-mode=0644Download
From 1cd93bc54d6e332eb0c9a3dba40bb7d07b91e632 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Fri, 2 Mar 2018 10:21:41 +0500
Subject: [PATCH] Fix GiST stats for partial indexes v3

---
 src/backend/access/gist/gistvacuum.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 22181c6299..b776c8ea73 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/htup_details.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
@@ -33,6 +34,7 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 				blkno;
 	BlockNumber totFreePages;
 	bool		needLock;
+	uint64		tuplesCount = 0;
 
 	/* No-op in ANALYZE ONLY mode */
 	if (info->analyze_only)
@@ -42,14 +44,6 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	if (stats == NULL)
 	{
 		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
-		/* use heap's tuple count */
-		stats->num_index_tuples = info->num_heap_tuples;
-		stats->estimated_count = info->estimated_count;
-
-		/*
-		 * XXX the above is wrong if index is partial.  Would it be OK to just
-		 * return NULL, or is there work we must do below?
-		 */
 	}
 
 	/*
@@ -82,12 +76,24 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 			totFreePages++;
 			RecordFreeIndexPage(rel, blkno);
 		}
+		else
+		{
+			/* Count tuples in leaf pages if needed */
+			if (GistPageIsLeaf(page))
+			{
+				tuplesCount += PageGetMaxOffsetNumber(page);
+			}
+		}
 		UnlockReleaseBuffer(buffer);
 	}
 
 	/* Finally, vacuum the FSM */
 	IndexFreeSpaceMapVacuum(info->index);
 
+	/* Update index tuples stat to counted over leaf pages if needed */
+	stats->num_index_tuples = tuplesCount;
+	stats->estimated_count = false;
+
 	/* return statistics */
 	stats->pages_free = totFreePages;
 	if (needLock)
-- 
2.14.3 (Apple Git-98)

#22Tom Lane
tgl@sss.pgh.pa.us
In reply to: Andrey Borodin (#21)
Re: New gist vacuum.

Andrey Borodin <x4mmm@yandex-team.ru> writes:

So, I agree, unconditional counting is a good idea. Here's the v3 patch.

Pushed with trivial cosmetic adjustments.

I've marked the CF entry as committed; please make a new CF entry in
2018-09 for the other patch. I'd also suggest starting a new email
thread for that. Linking the CF entry to a years-old thread doesn't
make it easy for people to find what's the current submission.

regards, tom lane

#23Andrey Borodin
x4mmm@yandex-team.ru
In reply to: Tom Lane (#22)
Re: New gist vacuum.

2 марта 2018 г., в 21:25, Tom Lane <tgl@sss.pgh.pa.us> написал(а):

Andrey Borodin <x4mmm@yandex-team.ru> writes:

So, I agree, unconditional counting is a good idea. Here's the v3 patch.

Pushed with trivial cosmetic adjustments.

I've marked the CF entry as committed; please make a new CF entry in
2018-09 for the other patch. I'd also suggest starting a new email
thread for that. Linking the CF entry to a years-old thread doesn't
make it easy for people to find what's the current submission.

Thanks, Tom!

Yes, I'll definitely start new thread for that patch. This thread had split unexpectedly, and I see it's not a convenient way. I've learned no to do it this way anymore :)

Best regards, Andrey Borodin.