Write visibility map during CLUSTER/VACUUM FULL

Started by Alexander Korotkov · over 6 years ago · 15 messages
#1 Alexander Korotkov
a.korotkov@postgrespro.ru
1 attachment(s)

Hi!

I found it weird that CLUSTER/VACUUM FULL don't write the visibility map.
The attached patch implements writing the visibility map in
heapam_relation_copy_for_cluster().

I've studied a previous attempt to implement this [1]. The main problem
with that attempt was its use of the existing heap_page_is_all_visible() and
visibilitymap_set() functions. These functions work through the buffer
manager, while heap rewriting bypasses the buffer manager.

In my patch, visibility map pages are handled in the same way as heap
pages: RewriteStateData holds the contents of a single VM page. Once a
heap page is finished, the corresponding bits are set in the VM page, and
so on. VM pages are flushed to smgr one by one.
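
To make the mapping concrete, here is a standalone sketch (illustration
only, not part of the patch) of the arithmetic behind the HEAPBLK_TO_*
macros, assuming the default 8 kB BLCKSZ and 2 VM bits per heap block:

#include <stdio.h>

/* Assumed defaults: 8 kB pages, 24-byte aligned page header, 2 bits/block */
#define BLCKSZ              8192
#define PAGE_HEADER_SIZE    24                      /* MAXALIGN(SizeOfPageHeaderData) */
#define BITS_PER_HEAPBLOCK  2
#define MAPSIZE             (BLCKSZ - PAGE_HEADER_SIZE)     /* 8168 bitmap bytes */
#define HEAPBLOCKS_PER_BYTE (8 / BITS_PER_HEAPBLOCK)        /* 4 */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) /* 32672 */

int
main(void)
{
	unsigned	heapblk = 100000;

	/* Same arithmetic as HEAPBLK_TO_MAPBLOCK / _MAPBYTE / _OFFSET */
	unsigned	mapBlock = heapblk / HEAPBLOCKS_PER_PAGE;
	unsigned	mapByte = (heapblk % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE;
	unsigned	mapOffset = (heapblk % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK;

	/* Prints: heap block 100000 -> VM block 3, byte 496, bit offset 0 */
	printf("heap block %u -> VM block %u, byte %u, bit offset %u\n",
		   heapblk, mapBlock, mapByte, mapOffset);
	return 0;
}

So the single 8 kB VM page kept in RewriteStateData covers 32672 heap
blocks, i.e. roughly 255 MB of rewritten heap, before it has to be flushed
and reinitialized.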

This patch has to implement its own check for whether a tuple is all-visible.
But it appears possible to simplify this check, given that all tuples have
already passed through HeapTupleSatisfiesVacuum(), which sets hint bits.
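
For clarity on what the hint bits buy us: once HeapTupleSatisfiesVacuum()
has hinted xmin as committed, the commit status can be read straight from
t_infomask, with no clog lookup. A minimal illustration (my restatement
using the definitions from access/htup_details.h, not code from the patch):

/*
 * Illustration only: HEAP_XMIN_COMMITTED is a hint bit in t_infomask, so
 * once HeapTupleSatisfiesVacuum() has set it, "is xmin committed?" becomes
 * a plain bit test.  That is what lets the rewrite path use a simplified
 * all-visible check instead of the full heap_page_is_all_visible() logic.
 */
static inline bool
xmin_hinted_committed(HeapTupleHeader tup)
{
	return (tup->t_infomask & HEAP_XMIN_COMMITTED) != 0;
}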

Links
1. /messages/by-id/20131203162556.GC27105@momjian.us

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

0001-write-vm-during-cluster-1.patch (application/octet-stream)
commit ba2422e48584dc745efa9ffeb28776eaed10b0d7
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Sun Sep 1 10:36:59 2019 +0300

    Write visibility map during CLUSTER/VACUUM FULL
    
    Reported-by:
    Bug:
    Discussion:
    Author:
    Reviewed-by:
    Tested-by:
    Backpatch-through:

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index a17508a82fb..602f52e4e2a 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -112,6 +112,7 @@
 #include "access/rewriteheap.h"
 #include "access/transam.h"
 #include "access/tuptoaster.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 
@@ -161,6 +162,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -222,6 +228,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -276,6 +285,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -297,6 +311,10 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	RelationOpenSmgr(state->rs_new_rel);
+	if (!smgrexists(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM))
+		smgrcreate(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -330,6 +348,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (state->rs_use_wal)
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -342,8 +363,14 @@ end_heap_rewrite(RewriteState state)
 
 		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
 				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
 	 * to ensure that the toast table gets fsync'd too.
@@ -364,6 +391,88 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (state->rs_use_wal)
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -490,6 +599,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -694,6 +804,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 		{
 			/* Doesn't fit, so write out the existing page */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (state->rs_use_wal)
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -715,6 +828,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index a08922b0798..657cac31457 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -98,24 +98,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 2d8804351ac..524abb4f2ca 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
#2 Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#1)
1 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Sun, Sep 1, 2019 at 11:07 AM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

This patch has to implement its own check for whether a tuple is all-visible.
But it appears possible to simplify this check, given that all tuples have
already passed through HeapTupleSatisfiesVacuum(), which sets hint bits.

Forgot to check tuple xmin against oldest xmin. Fixed.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

0001-write-vm-during-cluster-2.patch (application/octet-stream)
commit 4bde4bb23646718a1813cb0fb66b4b8f78f8c894
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Sun Sep 1 10:36:59 2019 +0300

    Write visibility map during CLUSTER/VACUUM FULL
    
    Reported-by:
    Bug:
    Discussion:
    Author:
    Reviewed-by:
    Tested-by:
    Backpatch-through:

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index a17508a82fb..98ff4afc583 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -112,6 +112,7 @@
 #include "access/rewriteheap.h"
 #include "access/transam.h"
 #include "access/tuptoaster.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 
@@ -161,6 +162,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -222,6 +228,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -276,6 +285,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -297,6 +311,10 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	RelationOpenSmgr(state->rs_new_rel);
+	if (!smgrexists(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM))
+		smgrcreate(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -330,6 +348,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (state->rs_use_wal)
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -342,8 +363,14 @@ end_heap_rewrite(RewriteState state)
 
 		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
 				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
 	 * to ensure that the toast table gets fsync'd too.
@@ -364,6 +391,98 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (state->rs_use_wal)
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -490,6 +609,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -694,6 +814,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 		{
 			/* Doesn't fit, so write out the existing page */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (state->rs_use_wal)
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -715,6 +838,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index a08922b0798..657cac31457 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -98,24 +98,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 2d8804351ac..524abb4f2ca 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
#3 Amit Kapila
amit.kapila16@gmail.com
In reply to: Alexander Korotkov (#1)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Sun, Sep 1, 2019 at 1:37 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

Hi!

I found it weird that CLUSTER/VACUUM FULL don't write the visibility map.
The attached patch implements writing the visibility map in
heapam_relation_copy_for_cluster().

I've studied a previous attempt to implement this [1]. The main problem
with that attempt was its use of the existing heap_page_is_all_visible() and
visibilitymap_set() functions. These functions work through the buffer
manager, while heap rewriting bypasses the buffer manager.

In my patch, visibility map pages are handled in the same way as heap
pages.

I haven't studied this patch in detail, but while glancing through it I
observed that it doesn't try to sync the VM pages at the end, as we do for
heap pages (during end_heap_rewrite). Am I missing something?

--
With Regards,
Amit Kapila.
EnterpriseDB: http://www.enterprisedb.com

#4 Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Amit Kapila (#3)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Wed, Sep 11, 2019 at 3:30 PM Amit Kapila <amit.kapila16@gmail.com> wrote:

On Sun, Sep 1, 2019 at 1:37 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

I found it weird that CLUSTER/VACUUM FULL don't write the visibility map.
The attached patch implements writing the visibility map in
heapam_relation_copy_for_cluster().

I've studied a previous attempt to implement this [1]. The main problem
with that attempt was its use of the existing heap_page_is_all_visible() and
visibilitymap_set() functions. These functions work through the buffer
manager, while heap rewriting bypasses the buffer manager.

In my patch, visibility map pages are handled in the same way as heap
pages.

I haven't studied this patch in detail, but while glancing through it I
observed that it doesn't try to sync the VM pages at the end, as we do for
heap pages (during end_heap_rewrite). Am I missing something?

You haven't missed anything. Yes, the VM needs a sync. Will fix this. And I
just noticed I need a closer look at what is going on with TOAST.

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

#5 Alexander Korotkov
a.korotkov@postgrespro.ru
In reply to: Alexander Korotkov (#4)
1 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Thu, Sep 12, 2019 at 4:55 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

On Wed, Sep 11, 2019 at 3:30 PM Amit Kapila <amit.kapila16@gmail.com> wrote:

On Sun, Sep 1, 2019 at 1:37 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

I found it weird that CLUSTER/VACUUM FULL don't write the visibility map.
The attached patch implements writing the visibility map in
heapam_relation_copy_for_cluster().

I've studied a previous attempt to implement this [1]. The main problem
with that attempt was its use of the existing heap_page_is_all_visible() and
visibilitymap_set() functions. These functions work through the buffer
manager, while heap rewriting bypasses the buffer manager.

In my patch, visibility map pages are handled in the same way as heap
pages.

I haven't studied this patch in detail, but while glancing through it I
observed that it doesn't try to sync the VM pages at the end, as we do for
heap pages (during end_heap_rewrite). Am I missing something?

You haven't missed anything. Yes, the VM needs a sync. Will fix this. And I
just noticed I need a closer look at what is going on with TOAST.

The attached patch syncs the VM during end_heap_rewrite().

However, the VM for TOAST still isn't written. It appears to be much more
difficult to write the VM for TOAST, because TOAST is filled by inserting
tuples one by one. Although that seems to fill TOAST heap pages
sequentially (assuming no FSM exists yet), it's quite hard to handle the
page-switching event with a reasonable level of abstraction.
Nevertheless, I find this patch useful in its current shape. Even if we
don't write the VM for TOAST, it's still useful to do so for the regular heap.
Additionally, one of the key advantages of having the VM is index-only
scans, which don't work for TOAST anyway.
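
To illustrate what handling the page-switching event would mean (purely a
hypothetical sketch, nothing like this is in the attached patch, and the
helper names are made up): since TOAST tuples go in one at a time, the
rewrite code would have to notice that an insertion landed on a new block
and only then fold the finished block's state into the VM, e.g.:

/*
 * Hypothetical sketch only -- not part of the attached patch.
 * ToastVmState, toast_vm_finish_block() and toast_vm_account_tuple() are
 * made-up names (bodies omitted), shown just to indicate where a
 * page-switch hook would have to live if the TOAST heap were filled
 * strictly sequentially.
 */
typedef struct ToastVmState
{
	BlockNumber last_blkno;		/* block the previous TOAST tuple went to */
	bool		all_visible;	/* running flags for that block */
	bool		all_frozen;
} ToastVmState;

static void toast_vm_finish_block(ToastVmState *state);			/* hypothetical */
static void toast_vm_account_tuple(ToastVmState *state, HeapTuple tup);	/* hypothetical */

static void
toast_vm_track(ToastVmState *state, HeapTuple toasttup)
{
	BlockNumber blkno = ItemPointerGetBlockNumber(&toasttup->t_self);

	if (blkno != state->last_blkno)
	{
		/* the previous block is finished: set its VM bits now */
		toast_vm_finish_block(state);
		state->last_blkno = blkno;
		state->all_visible = true;
		state->all_frozen = true;
	}
	toast_vm_account_tuple(state, toasttup);
}

The awkward part is that such tracking would have to be threaded through
the generic TOAST insertion path (toast_save_datum() and heap_insert()),
which is exactly the abstraction problem mentioned above.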

------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

0001-write-vm-during-cluster-3.patch (application/octet-stream)
commit 48edc162d325e9240b7fdc8db472999e67c4a431
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Sun Sep 1 10:36:59 2019 +0300

    Write visibility map during CLUSTER/VACUUM FULL
    
    Reported-by:
    Bug:
    Discussion:
    Author:
    Reviewed-by:
    Tested-by:
    Backpatch-through:

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index e9544822bf9..4446c6de9ba 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8935,7 +8935,7 @@ heap2_redo(XLogReaderState *record)
  * to be done here.)
  */
 void
-heap_sync(Relation rel)
+heap_sync(Relation rel, bool sync_vm)
 {
 	/* non-WAL-logged tables never need fsync */
 	if (!RelationNeedsWAL(rel))
@@ -8945,6 +8945,8 @@ heap_sync(Relation rel)
 	FlushRelationBuffers(rel);
 	/* FlushRelationBuffers will have opened rd_smgr */
 	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+	if (sync_vm)
+		smgrimmedsync(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
 
 	/* FSM is not critical, don't bother syncing it */
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 2dd8821facd..0429919ec86 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -566,7 +566,7 @@ heapam_finish_bulk_insert(Relation relation, int options)
 	 * indexes since those use WAL anyway / don't go through tableam)
 	 */
 	if (options & HEAP_INSERT_SKIP_WAL)
-		heap_sync(relation);
+		heap_sync(relation, false);
 }
 
 
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 0172a139576..45a85307e90 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -112,6 +112,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 
@@ -161,6 +162,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -222,6 +228,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -276,6 +285,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	memset(&hash_ctl, 0, sizeof(hash_ctl));
@@ -297,6 +311,10 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	RelationOpenSmgr(state->rs_new_rel);
+	if (!smgrexists(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM))
+		smgrcreate(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -330,6 +348,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (state->rs_use_wal)
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -342,8 +363,14 @@ end_heap_rewrite(RewriteState state)
 
 		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
 				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * If the rel is WAL-logged, must fsync before commit.  We use heap_sync
 	 * to ensure that the toast table gets fsync'd too.
@@ -356,7 +383,7 @@ end_heap_rewrite(RewriteState state)
 	 * wrote before the checkpoint.
 	 */
 	if (RelationNeedsWAL(state->rs_new_rel))
-		heap_sync(state->rs_new_rel);
+		heap_sync(state->rs_new_rel, true);
 
 	logical_end_heap_rewrite(state);
 
@@ -364,6 +391,98 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (state->rs_use_wal)
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -490,6 +609,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -694,6 +814,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 		{
 			/* Doesn't fit, so write out the existing page */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (state->rs_use_wal)
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -715,6 +838,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index a08922b0798..657cac31457 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -98,24 +98,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 858bcb6bc96..bb0c610d676 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -166,7 +166,7 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid);
 extern void simple_heap_update(Relation relation, ItemPointer otid,
 							   HeapTuple tup);
 
-extern void heap_sync(Relation relation);
+extern void heap_sync(Relation relation, bool sync_vm);
 
 extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel,
 														 ItemPointerData *items,
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 2d8804351ac..524abb4f2ca 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
#6 Andres Freund
andres@anarazel.de
In reply to: Alexander Korotkov (#5)
Re: Write visibility map during CLUSTER/VACUUM FULL

Hi,

On 2019-09-13 22:22:50 +0300, Alexander Korotkov wrote:

On Thu, Sep 12, 2019 at 4:55 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

On Wed, Sep 11, 2019 at 3:30 PM Amit Kapila <amit.kapila16@gmail.com> wrote:

On Sun, Sep 1, 2019 at 1:37 PM Alexander Korotkov
<a.korotkov@postgrespro.ru> wrote:

I found it weird that CLUSTER/VACUUM FULL don't write the visibility map.
The attached patch implements writing the visibility map in
heapam_relation_copy_for_cluster().

I've studied a previous attempt to implement this [1]. The main problem
with that attempt was its use of the existing heap_page_is_all_visible() and
visibilitymap_set() functions. These functions work through the buffer
manager, while heap rewriting bypasses the buffer manager.

In my patch, visibility map pages are handled in the same way as heap
pages.

Why do you want to do that? To me that doesn't immediately seem like a
good idea, and I've not seen a justification for it in this thread. Did I
miss something?

I haven't studied this patch in detail, but while glancing through it I
observed that it doesn't try to sync the VM pages at the end, as we do for
heap pages (during end_heap_rewrite). Am I missing something?

You haven't missed anything. Yes, the VM needs a sync. Will fix this. And I
just noticed I need a closer look at what is going on with TOAST.

The attached patch syncs the VM during end_heap_rewrite().

However, the VM for TOAST still isn't written. It appears to be much more
difficult to write the VM for TOAST, because TOAST is filled by inserting
tuples one by one. Although that seems to fill TOAST heap pages
sequentially (assuming no FSM exists yet), it's quite hard to handle the
page-switching event with a reasonable level of abstraction.
Nevertheless, I find this patch useful in its current shape. Even if we
don't write the VM for TOAST, it's still useful to do so for the regular heap.
Additionally, one of the key advantages of having the VM is index-only
scans, which don't work for TOAST anyway.

Have you looked at the discussion around
/messages/by-id/20190408010427.4l63qr7h2fjcyp77@alap3.anarazel.de
?

That's not quite the same thing as you're tackling here, but I wonder if
some of the logic could be the same. Especially for toast.

+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (state->rs_use_wal)
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}

I think it's a bad idea to copy this many VM implementation details into
rewriteheap.c.

+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}

I don't think we should have yet another copy of logic determining
visibility. It has repeatedly proven hard to get right in the past, and
it'll not get better by having yet another copy. Especially not because
we've basically already done a HTSV (via
heapam_relation_copy_for_cluster), and we're now redoing a poor man's
version of it.

Greetings,

Andres Freund

#7 Michael Paquier
michael@paquier.xyz
In reply to: Andres Freund (#6)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Fri, Sep 20, 2019 at 01:05:06PM -0700, Andres Freund wrote:

I don't think we should have yet another copy of logic determining
visibility. It has repeatedly proven hard to get right in the past, and
it'll not get better by having yet another copy. Especially not because
we've basically already done a HTSV (via
heapam_relation_copy_for_cluster), and we're now redoing a poor man's
version of it.

These comments have gone unanswered for more than 2 months, so I am marking
this entry as returned with feedback. Please feel free to change it if you
think that's not appropriate.
--
Michael

#8 Anna Akenteva
a.akenteva@postgrespro.ru
In reply to: Michael Paquier (#7)
2 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On 2019-11-29 05:32, Michael Paquier wrote:

These comments have gone unanswered for more than 2 months, so I am marking
this entry as returned with feedback.

I'd like to revive this patch. To make further work easier, I'm attaching a
rebased version of the latest patch by Alexander.

As for having yet another copy of the logic determining visibility: the
visibility check was mainly taken from heap_page_is_all_visible(), so I
refactored the code to make sure that logic is not duplicated. The
updated patch is attached too.
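
In short, instead of re-deriving visibility in rewriteheap.c, the verdict
that heapam_relation_copy_for_cluster() already gets from
HeapTupleSatisfiesVacuum() now travels with the tuple. Condensed from the
attached -4 patch (see the full diff below for context), the relevant call
sites become:

	/* during the scan: islive is set from the HTSV result */
	tuplesort_putheaptuple(tuplesort, tuple, islive);

	/* in scan-and-sort mode: the flag comes back out with the tuple */
	tuple = tuplesort_getheaptuple(tuplesort, true, &islive);

	/* and is finally handed to the rewrite module together with the tuple */
	rewrite_heap_tuple(rwstate, islive, tuple, copiedTuple);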

--
Anna Akenteva
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company

Attachments:

0001-write-vm-during-cluster-3-rebase.patch (text/x-diff)
commit b7482718231f0abf584cc65c5dcd19a91fefa512
Author: Akenteva Anna <a.akenteva@postgrespro.ru>
Date:   Mon Jun 28 08:22:53 2021 +0300

    0001-write-vm-during-cluster-3-rebase.patch

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 1aff62cd42..ecb73aa608 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -110,6 +110,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
@@ -152,6 +153,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -213,6 +219,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -264,6 +273,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -284,6 +298,10 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	RelationOpenSmgr(state->rs_new_rel);
+	if (!smgrexists(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM))
+		smgrcreate(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -317,6 +335,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -329,8 +350,14 @@ end_heap_rewrite(RewriteState state)
 		RelationOpenSmgr(state->rs_new_rel);
 		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
 				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
 	 * reason is the same as in storage.c's RelationCopyStorage(): we're
@@ -343,6 +370,7 @@ end_heap_rewrite(RewriteState state)
 		/* for an empty table, this could be first smgr access */
 		RelationOpenSmgr(state->rs_new_rel);
 		smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM);
 	}
 
 	logical_end_heap_rewrite(state);
@@ -351,6 +379,98 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (RelationNeedsWAL(state->rs_new_rel))
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -477,6 +597,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -682,6 +803,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * enforce saveFreeSpace unconditionally.
 			 */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -702,6 +826,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d8..98e249b0c2 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -99,24 +99,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 57362c3687..39f3d59156 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
0001-write-vm-during-cluster-4.patch (text/x-diff)
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 1b8b640012..f9cfc5d911 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -47,7 +47,8 @@
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
-									 Datum *values, bool *isnull, RewriteState rwstate);
+									 Datum *values, bool *isnull, bool islive,
+									 RewriteState rwstate);
 
 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   HeapTuple tuple,
@@ -779,6 +780,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		HeapTuple	tuple;
 		Buffer		buf;
 		bool		isdead;
+		bool		islive = false;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -845,6 +847,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			case HEAPTUPLE_LIVE:
 				/* Live or recently dead, must copy it */
 				isdead = false;
+				islive = true;
 				break;
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 
@@ -900,7 +903,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		*num_tuples += 1;
 		if (tuplesort != NULL)
 		{
-			tuplesort_putheaptuple(tuplesort, tuple);
+			tuplesort_putheaptuple(tuplesort, tuple, islive);
 
 			/*
 			 * In scan-and-sort mode, report increase in number of tuples
@@ -918,7 +921,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			int64		ct_val[2];
 
 			reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
-									 values, isnull, rwstate);
+									 values, isnull, islive, rwstate);
 
 			/*
 			 * In indexscan mode and also VACUUM FULL, report increase in
@@ -958,17 +961,18 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		for (;;)
 		{
 			HeapTuple	tuple;
+			bool		islive;
 
 			CHECK_FOR_INTERRUPTS();
 
-			tuple = tuplesort_getheaptuple(tuplesort, true);
+			tuple = tuplesort_getheaptuple(tuplesort, true, &islive);
 			if (tuple == NULL)
 				break;
 
 			n_tuples += 1;
 			reform_and_rewrite_tuple(tuple,
 									 OldHeap, NewHeap,
-									 values, isnull,
+									 values, isnull, islive,
 									 rwstate);
 			/* Report n_tuples */
 			pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
@@ -2455,7 +2459,8 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
 static void
 reform_and_rewrite_tuple(HeapTuple tuple,
 						 Relation OldHeap, Relation NewHeap,
-						 Datum *values, bool *isnull, RewriteState rwstate)
+						 Datum *values, bool *isnull, bool islive,
+						 RewriteState rwstate)
 {
 	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
 	TupleDesc	newTupDesc = RelationGetDescr(NewHeap);
@@ -2474,7 +2479,7 @@ reform_and_rewrite_tuple(HeapTuple tuple,
 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
 
 	/* The heap rewrite module does the rest */
-	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+	rewrite_heap_tuple(rwstate, islive, tuple, copiedTuple);
 
 	heap_freetuple(copiedTuple);
 }
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 1aff62cd42..dc136d2a51 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -110,6 +110,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
@@ -152,6 +153,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -213,6 +219,8 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -264,6 +272,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -284,6 +297,10 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	RelationOpenSmgr(state->rs_new_rel);
+	if (!smgrexists(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM))
+		smgrcreate(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -317,6 +334,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -329,8 +349,14 @@ end_heap_rewrite(RewriteState state)
 		RelationOpenSmgr(state->rs_new_rel);
 		smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM, state->rs_blockno,
 				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
 	 * reason is the same as in storage.c's RelationCopyStorage(): we're
@@ -343,6 +369,7 @@ end_heap_rewrite(RewriteState state)
 		/* for an empty table, this could be first smgr access */
 		RelationOpenSmgr(state->rs_new_rel);
 		smgrimmedsync(state->rs_new_rel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM);
 	}
 
 	logical_end_heap_rewrite(state);
@@ -351,6 +378,55 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (RelationNeedsWAL(state->rs_new_rel))
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+	RelationOpenSmgr(state->rs_new_rel);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(state->rs_new_rel->rd_smgr, VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -359,11 +435,12 @@ end_heap_rewrite(RewriteState state)
  * it had better be temp storage not a pointer to the original tuple.
  *
  * state		opaque state as returned by begin_heap_rewrite
+ * is_live		whether the currently processed tuple is live
  * old_tuple	original tuple in the old heap
  * new_tuple	new, rewritten tuple to be inserted to new heap
  */
 void
-rewrite_heap_tuple(RewriteState state,
+rewrite_heap_tuple(RewriteState state, bool is_live,
 				   HeapTuple old_tuple, HeapTuple new_tuple)
 {
 	MemoryContext old_cxt;
@@ -477,6 +554,10 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		heap_page_update_vm_flags(new_tuple, state->rs_oldest_xmin,
+								 is_live, true,
+								 &(state->rs_all_visible),
+								 &(state->rs_all_frozen));
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -682,6 +763,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * enforce saveFreeSpace unconditionally.
 			 */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -702,6 +786,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(state->rs_new_rel->rd_smgr, MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 44f198398c..9feef917cd 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3610,6 +3610,53 @@ vac_cmp_itemptr(const void *left, const void *right)
 	return 0;
 }
 
+/*
+ * Update all_visible and all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+void
+heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+						  bool is_live, bool check_infomask,
+						  bool *all_visible, bool *all_frozen)
+{
+	TransactionId xmin;
+
+	/* Check comments in lazy_scan_heap. */
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data) || !is_live)
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/*
+	 * The inserter definitely committed. But is it old enough
+	 * that everyone sees it as committed?
+	 */
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, OldestXmin))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	if (check_infomask &&
+		!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/* Check whether this tuple is already frozen or not */
+	if (*all_visible && *all_frozen &&
+		heap_tuple_needs_eventual_freeze(tuple->t_data))
+		*all_frozen = false;
+}
+
 /*
  * Check if every tuple in the given page is visible to all current and future
  * transactions. Also return the visibility_cutoff_xid which is the highest
@@ -3678,36 +3725,16 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 				{
 					TransactionId xmin;
 
-					/* Check comments in lazy_scan_heap. */
-					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
-
-					/*
-					 * The inserter definitely committed. But is it old enough
-					 * that everyone sees it as committed?
-					 */
-					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
-					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
+					heap_page_update_vm_flags(&tuple, vacrel->OldestXmin,
+											  true, false,
+											  &all_visible, all_frozen);
 
 					/* Track newest xmin on page. */
+					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
 						*visibility_cutoff_xid = xmin;
-
-					/* Check whether this tuple is already frozen or not */
-					if (all_visible && *all_frozen &&
-						heap_tuple_needs_eventual_freeze(tuple.t_data))
-						*all_frozen = false;
+					break;
 				}
-				break;
 
 			case HEAPTUPLE_DEAD:
 			case HEAPTUPLE_RECENTLY_DEAD:
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e198df65d8..98e249b0c2 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -99,24 +99,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 22972071ff..9ec922a047 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -180,6 +180,7 @@ typedef struct
 	Datum		datum1;			/* value of first key column */
 	bool		isnull1;		/* is first key column NULL? */
 	int			srctape;		/* source tape number */
+	bool		is_live;		/* is the tuple alive? */
 } SortTuple;
 
 /*
@@ -1703,7 +1704,7 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
  * Note that the input data is always copied; the caller need not save it.
  */
 void
-tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
+tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -1715,6 +1716,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 	COPYTUP(state, &stup, (void *) tup);
 
 	puttuple_common(state, &stup);
+	stup.is_live = is_live;
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -2442,7 +2444,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
  * remaining valid after any further manipulation of tuplesort.
  */
 HeapTuple
-tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
+tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -2452,6 +2454,9 @@ tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
 
 	MemoryContextSwitchTo(oldcontext);
 
+	if (is_live)
+		*is_live = stup.is_live;
+
 	return stup.tuple;
 }
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e63b49fc38..71bda855f2 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -199,6 +199,9 @@ struct VacuumParams;
 extern void heap_vacuum_rel(Relation rel,
 							struct VacuumParams *params, BufferAccessStrategy bstrategy);
 extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc);
+extern void heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+									  bool is_live, bool check_infomask,
+									  bool *all_visible, bool *all_frozen);
 
 /* in heap/heapam_visibility.c */
 extern bool HeapTupleSatisfiesVisibility(HeapTuple stup, Snapshot snapshot,
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 121f552405..2cda7320d3 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -25,8 +25,8 @@ extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 									   TransactionId OldestXmin, TransactionId FreezeXid,
 									   MultiXactId MultiXactCutoff);
 extern void end_heap_rewrite(RewriteState state);
-extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
-							   HeapTuple newTuple);
+extern void rewrite_heap_tuple(RewriteState state, bool is_live,
+							   HeapTuple oldTuple, HeapTuple newTuple);
 extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
 
 /*
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 57362c3687..39f3d59156 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index f94949370b..45442c4351 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -232,7 +232,7 @@ extern bool tuplesort_used_bound(Tuplesortstate *state);
 
 extern void tuplesort_puttupleslot(Tuplesortstate *state,
 								   TupleTableSlot *slot);
-extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
+extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live);
 extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
 										  Relation rel, ItemPointer self,
 										  Datum *values, bool *isnull);
@@ -243,7 +243,7 @@ extern void tuplesort_performsort(Tuplesortstate *state);
 
 extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
 								   bool copy, TupleTableSlot *slot, Datum *abbrev);
-extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
+extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live);
 extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
 extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
 							   Datum *val, bool *isNull, Datum *abbrev);
#9vignesh C
vignesh21@gmail.com
In reply to: Anna Akenteva (#8)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Mon, Jun 28, 2021 at 1:52 PM Anna Akenteva <a.akenteva@postgrespro.ru> wrote:

On 2019-11-29 05:32, Michael Paquier wrote:

These comments are unanswered for more than 2 months, so I am marking
this entry as returned with feedback.

I'd like to revive this patch. To make further work easier, attaching a
rebased version of the latest patch by Alexander.

As for having yet another copy of logic determining visibility: the
visibility check was mainly taken from heap_page_is_all_visible(), so I
refactored the code to make sure that logic is not duplicated. The
updated patch is attached too.

The patch does not apply on HEAD anymore; could you rebase and post an
updated patch? I'm changing the status to "Waiting for Author".

Regards,
Vignesh

#10Justin Pryzby
pryzby@telsasoft.com
In reply to: Anna Akenteva (#8)
3 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Mon, Jun 28, 2021 at 11:22:01AM +0300, Anna Akenteva wrote:

On 2019-11-29 05:32, Michael Paquier wrote:

These comments are unanswered for more than 2 months, so I am marking
this entry as returned with feedback.

I'd like to revive this patch. To make further work easier, attaching a
rebased version of the latest patch by Alexander.

As for having yet another copy of logic determining visibility: the
visibility check was mainly taken from heap_page_is_all_visible(), so I
refactored the code to make sure that logic is not duplicated. The updated
patch is attached too.

I agree that it's strange that VACUUM(FREEZE) freezes tuples but sets neither the
VM bits nor the page-level PD_ALL_VISIBLE flag (which is implied by all-frozen).
After a FULL run (which takes an exclusive lock on the table), it's necessary to
vacuum the table again to get what was intended.
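
For instance, here's a minimal way to see that on an unpatched server (just a
sketch; it assumes the pg_visibility extension is available and uses a throwaway
table):

CREATE EXTENSION IF NOT EXISTS pg_visibility;
CREATE TABLE t AS SELECT generate_series(1,99999);
VACUUM (FULL, FREEZE) t;
-- both counts stay at zero until a second, plain VACUUM runs:
SELECT COUNT(*) FILTER (WHERE all_visible) AS allvis,
       COUNT(*) FILTER (WHERE all_frozen) AS allfrz
  FROM pg_visibility('t');
VACUUM t;  -- only this second pass sets the VM bits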

Rebased on f10f0ae420ee62400876ab34dca2c09c20dcd030.

And rephrased Anna's two independent/alternate patches as a 2nd patch on top of
the 1st, which makes it easier to read and reduces its total size.

I noticed while testing the patch that autovacuum still hits the relation
shortly after VACUUM FULL. This is due to n_ins_since_vacuum, which is new
in pg13. I don't know how to address that (or even whether it should be
addressed at all).
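
For reference, a sketch of how one might watch that counter and the related
threshold (the column and the GUC both exist since v13):

SELECT relname, n_ins_since_vacuum, last_autovacuum
  FROM pg_stat_all_tables WHERE relname = 't';
SHOW autovacuum_vacuum_insert_threshold;

Since the counter isn't reset by VACUUM FULL, the insert threshold can still
schedule an autovacuum pass on the freshly rewritten table.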

Also, pg_class.relallvisible is not set unless vacuum/analyze is run again
(which is mitigated by the n_ins behavior above). It seems like this might be
important: a plan which uses an index-only scan might be missed in favour of
something else until autovacuum runs (it will use cost-based delay, might not
be scheduled immediately, could be interrupted, or even be disabled).
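
As a rough illustration (hypothetical index, not part of the patch set): with
relallvisible = 0 the planner has to assume a heap fetch for every row, so an
index-only scan is costed pessimistically:

CREATE INDEX ON t (generate_series);  -- column name comes from the CTAS below
EXPLAIN SELECT generate_series FROM t WHERE generate_series < 100;
-- until relallvisible is populated (by a later VACUUM, or by the 0003 patch
-- here), the index-only scan is estimated to visit the heap for each row and
-- may lose to a plain index scan or bitmap scan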

I'm testing like this:
CREATE EXTENSION IF NOT EXISTS pg_visibility;
DROP TABLE t;
CREATE TABLE t AS SELECT generate_series(1,99999);
VACUUM FULL t;
ANALYZE t;
SELECT n_ins_since_vacuum, n_tup_upd, n_tup_del, n_tup_hot_upd
  FROM pg_stat_all_tables WHERE relname='t';
SELECT relpages, relallvisible FROM pg_class WHERE oid='t'::regclass;
SELECT COUNT(1),
       COUNT(1) FILTER (WHERE all_visible='t') allvis,
       COUNT(1) FILTER (WHERE all_frozen='t') allfrz,
       COUNT(1) FILTER (WHERE pd_all_visible='t') allvispd
  FROM pg_visibility('t');

--
Justin

Attachments:

0001-VACUUM-FULL-CLUSTER-set-VM-and-page-level-visibility.patch (text/x-diff; charset=us-ascii)
From 4ec10accb276d9ecde461c6cde0be5a610ed3dc6 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 15:57:09 -0500
Subject: [PATCH 1/3] VACUUM FULL/CLUSTER: set VM and page-level visibility
 bits

From: 	Alexander Korotkov
write-vm-during-cluster-3-rebase
---
 src/backend/access/heap/rewriteheap.c   | 131 +++++++++++++++++++++++-
 src/backend/access/heap/visibilitymap.c |  18 ----
 src/include/access/visibilitymap.h      |  18 ++++
 3 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 986a776bbd..af0d1b2d0c 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -110,6 +110,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
@@ -152,6 +153,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -213,6 +219,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -264,6 +273,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -284,6 +298,9 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	if (!smgrexists(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM))
+		smgrcreate(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -317,6 +334,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -326,10 +346,16 @@ end_heap_rewrite(RewriteState state)
 
 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
 
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, (char *) state->rs_buffer, true);
+		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, state->rs_blockno,
+				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
 	 * reason is the same as in storage.c's RelationCopyStorage(): we're
@@ -338,7 +364,11 @@ end_heap_rewrite(RewriteState state)
 	 * wrote before the checkpoint.
 	 */
 	if (RelationNeedsWAL(state->rs_new_rel))
+	{
+		/* for an empty table, this could be first smgr access */
 		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM);
+	}
 
 	logical_end_heap_rewrite(state);
 
@@ -346,6 +376,97 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (RelationNeedsWAL(state->rs_new_rel))
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -472,6 +593,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -677,6 +799,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * enforce saveFreeSpace unconditionally.
 			 */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -695,6 +820,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd30..dd0e686a9f 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -99,24 +99,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 57362c3687..39f3d59156 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -34,6 +34,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
-- 
2.17.0

0002-refactor-the-code-to-make-sure-that-logic-is-not-dup.patch (text/x-diff; charset=us-ascii)
From 3454aceee4c1806664b3932e0217fc50d71be52d Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:01:13 -0500
Subject: [PATCH 2/3] refactor the code to make sure that logic is not
 duplicated.

0001-write-vm-during-cluster-4.patch
Date: Mon, 28 Jun 2021 11:22:01 +0300
From: Anna Akenteva <a.akenteva@postgrespro.ru>
---
 src/backend/access/heap/heapam_handler.c | 19 +++---
 src/backend/access/heap/rewriteheap.c    | 12 +++-
 src/backend/access/heap/vacuumlazy.c     | 77 ++++++++++++++++--------
 src/backend/utils/sort/tuplesort.c       |  9 ++-
 src/include/access/heapam.h              |  3 +
 src/include/access/rewriteheap.h         |  4 +-
 src/include/utils/tuplesort.h            |  4 +-
 7 files changed, 88 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index beb8f20708..c5d44c10e8 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -47,7 +47,8 @@
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
-									 Datum *values, bool *isnull, RewriteState rwstate);
+									 Datum *values, bool *isnull, bool islive,
+									 RewriteState rwstate);
 
 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   HeapTuple tuple,
@@ -778,6 +779,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		HeapTuple	tuple;
 		Buffer		buf;
 		bool		isdead;
+		bool		islive = false;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -844,6 +846,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			case HEAPTUPLE_LIVE:
 				/* Live or recently dead, must copy it */
 				isdead = false;
+				islive = true;
 				break;
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 
@@ -899,7 +902,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		*num_tuples += 1;
 		if (tuplesort != NULL)
 		{
-			tuplesort_putheaptuple(tuplesort, tuple);
+			tuplesort_putheaptuple(tuplesort, tuple, islive);
 
 			/*
 			 * In scan-and-sort mode, report increase in number of tuples
@@ -917,7 +920,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			int64		ct_val[2];
 
 			reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
-									 values, isnull, rwstate);
+									 values, isnull, islive, rwstate);
 
 			/*
 			 * In indexscan mode and also VACUUM FULL, report increase in
@@ -957,17 +960,18 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		for (;;)
 		{
 			HeapTuple	tuple;
+			bool		islive;
 
 			CHECK_FOR_INTERRUPTS();
 
-			tuple = tuplesort_getheaptuple(tuplesort, true);
+			tuple = tuplesort_getheaptuple(tuplesort, true, &islive);
 			if (tuple == NULL)
 				break;
 
 			n_tuples += 1;
 			reform_and_rewrite_tuple(tuple,
 									 OldHeap, NewHeap,
-									 values, isnull,
+									 values, isnull, islive,
 									 rwstate);
 			/* Report n_tuples */
 			pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
@@ -2454,7 +2458,8 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
 static void
 reform_and_rewrite_tuple(HeapTuple tuple,
 						 Relation OldHeap, Relation NewHeap,
-						 Datum *values, bool *isnull, RewriteState rwstate)
+						 Datum *values, bool *isnull, bool islive,
+						 RewriteState rwstate)
 {
 	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
 	TupleDesc	newTupDesc = RelationGetDescr(NewHeap);
@@ -2473,7 +2478,7 @@ reform_and_rewrite_tuple(HeapTuple tuple,
 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
 
 	/* The heap rewrite module does the rest */
-	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+	rewrite_heap_tuple(rwstate, islive, tuple, copiedTuple);
 
 	heap_freetuple(copiedTuple);
 }
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index af0d1b2d0c..623149ac26 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -424,6 +424,8 @@ rewrite_set_vm_flags(RewriteState state)
 	map[mapByte] |= (flags << mapOffset);
 }
 
+// rewrite_update_vm_flags(state, new_tuple);
+
 /*
  * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
  * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
@@ -475,11 +477,12 @@ rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
  * it had better be temp storage not a pointer to the original tuple.
  *
  * state		opaque state as returned by begin_heap_rewrite
+ * is_live		whether the currently processed tuple is live
  * old_tuple	original tuple in the old heap
  * new_tuple	new, rewritten tuple to be inserted to new heap
  */
 void
-rewrite_heap_tuple(RewriteState state,
+rewrite_heap_tuple(RewriteState state, bool is_live,
 				   HeapTuple old_tuple, HeapTuple new_tuple)
 {
 	MemoryContext old_cxt;
@@ -593,7 +596,12 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
-		rewrite_update_vm_flags(state, new_tuple);
+
+		heap_page_update_vm_flags(new_tuple, state->rs_oldest_xmin,
+								 is_live, true,
+								 &(state->rs_all_visible),
+								 &(state->rs_all_frozen));
+
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 2589f80a52..97e1603f27 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3611,6 +3611,53 @@ vac_cmp_itemptr(const void *left, const void *right)
 	return 0;
 }
 
+/*
+ * Update all_visible and all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+void
+heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+						  bool is_live, bool check_infomask,
+						  bool *all_visible, bool *all_frozen)
+{
+	TransactionId xmin;
+
+	/* Check comments in lazy_scan_heap. */
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data) || !is_live)
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/*
+	 * The inserter definitely committed. But is it old enough
+	 * that everyone sees it as committed?
+	 */
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, OldestXmin))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	if (check_infomask &&
+		!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/* Check whether this tuple is already frozen or not */
+	if (*all_visible && *all_frozen &&
+		heap_tuple_needs_eventual_freeze(tuple->t_data))
+		*all_frozen = false;
+}
+
 /*
  * Check if every tuple in the given page is visible to all current and future
  * transactions. Also return the visibility_cutoff_xid which is the highest
@@ -3679,36 +3726,16 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 				{
 					TransactionId xmin;
 
-					/* Check comments in lazy_scan_heap. */
-					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
-
-					/*
-					 * The inserter definitely committed. But is it old enough
-					 * that everyone sees it as committed?
-					 */
-					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
-					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
+					heap_page_update_vm_flags(&tuple, vacrel->OldestXmin,
+											  true, false,
+											  &all_visible, all_frozen);
 
 					/* Track newest xmin on page. */
+					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
 						*visibility_cutoff_xid = xmin;
-
-					/* Check whether this tuple is already frozen or not */
-					if (all_visible && *all_frozen &&
-						heap_tuple_needs_eventual_freeze(tuple.t_data))
-						*all_frozen = false;
+					break;
 				}
-				break;
 
 			case HEAPTUPLE_DEAD:
 			case HEAPTUPLE_RECENTLY_DEAD:
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index b17347b214..d5a0491bf8 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -180,6 +180,7 @@ typedef struct
 	Datum		datum1;			/* value of first key column */
 	bool		isnull1;		/* is first key column NULL? */
 	int			srctape;		/* source tape number */
+	bool		is_live;		/* is the tuple alive? */
 } SortTuple;
 
 /*
@@ -1703,7 +1704,7 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
  * Note that the input data is always copied; the caller need not save it.
  */
 void
-tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
+tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -1715,6 +1716,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 	COPYTUP(state, &stup, (void *) tup);
 
 	puttuple_common(state, &stup);
+	stup.is_live = is_live;
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -2442,7 +2444,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
  * remaining valid after any further manipulation of tuplesort.
  */
 HeapTuple
-tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
+tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -2452,6 +2454,9 @@ tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
 
 	MemoryContextSwitchTo(oldcontext);
 
+	if (is_live)
+		*is_live = stup.is_live;
+
 	return stup.tuple;
 }
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e63b49fc38..71bda855f2 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -199,6 +199,9 @@ struct VacuumParams;
 extern void heap_vacuum_rel(Relation rel,
 							struct VacuumParams *params, BufferAccessStrategy bstrategy);
 extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc);
+extern void heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+									  bool is_live, bool check_infomask,
+									  bool *all_visible, bool *all_frozen);
 
 /* in heap/heapam_visibility.c */
 extern bool HeapTupleSatisfiesVisibility(HeapTuple stup, Snapshot snapshot,
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 121f552405..2cda7320d3 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -25,8 +25,8 @@ extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 									   TransactionId OldestXmin, TransactionId FreezeXid,
 									   MultiXactId MultiXactCutoff);
 extern void end_heap_rewrite(RewriteState state);
-extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
-							   HeapTuple newTuple);
+extern void rewrite_heap_tuple(RewriteState state, bool is_live,
+							   HeapTuple oldTuple, HeapTuple newTuple);
 extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
 
 /*
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index f94949370b..45442c4351 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -232,7 +232,7 @@ extern bool tuplesort_used_bound(Tuplesortstate *state);
 
 extern void tuplesort_puttupleslot(Tuplesortstate *state,
 								   TupleTableSlot *slot);
-extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
+extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live);
 extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
 										  Relation rel, ItemPointer self,
 										  Datum *values, bool *isnull);
@@ -243,7 +243,7 @@ extern void tuplesort_performsort(Tuplesortstate *state);
 
 extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
 								   bool copy, TupleTableSlot *slot, Datum *abbrev);
-extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
+extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live);
 extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
 extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
 							   Datum *val, bool *isNull, Datum *abbrev);
-- 
2.17.0

0003-cluster-set-relallvisible-num_pages.patch (text/x-diff; charset=us-ascii)
From c40fae27093f895bb2262577cd8fc5b289682493 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:55:38 -0500
Subject: [PATCH 3/3] cluster: set relallvisible=num_pages

---
 src/backend/commands/cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 83e968cfa4..86f772c107 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1016,6 +1016,7 @@ copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	relform = (Form_pg_class) GETSTRUCT(reltup);
 
 	relform->relpages = num_pages;
+	relform->relallvisible = num_pages;
 	relform->reltuples = num_tuples;
 
 	/* Don't update the stats for pg_class.  See swap_relation_files. */
-- 
2.17.0

#11Daniel Gustafsson
daniel@yesql.se
In reply to: Justin Pryzby (#10)
Re: Write visibility map during CLUSTER/VACUUM FULL

On 30 Aug 2021, at 02:26, Justin Pryzby <pryzby@telsasoft.com> wrote:

Rebased on f10f0ae420ee62400876ab34dca2c09c20dcd030.

This patch no longer applies, please submit a rebased version.

--
Daniel Gustafsson https://vmware.com/

#12Justin Pryzby
pryzby@telsasoft.com
In reply to: Justin Pryzby (#10)
3 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Sun, Aug 29, 2021 at 07:26:42PM -0500, Justin Pryzby wrote:

On Mon, Jun 28, 2021 at 11:22:01AM +0300, Anna Akenteva wrote:

On 2019-11-29 05:32, Michael Paquier wrote:

These comments are unanswered for more than 2 months, so I am marking
this entry as returned with feedback.

I'd like to revive this patch. To make further work easier, attaching a
rebased version of the latest patch by Alexander.

As for having yet another copy of logic determining visibility: the
visibility check was mainly taken from heap_page_is_all_visible(), so I
refactored the code to make sure that logic is not duplicated. The updated
patch is attached too.

I agree that it's strange that VACUUM(FREEZE) freezes tuples but sets neither the
VM bits nor the page-level PD_ALL_VISIBLE flag (which is implied by all-frozen).
After a FULL run (which takes an exclusive lock on the table), it's necessary to
vacuum the table again to get what was intended.

Rebased on f10f0ae420ee62400876ab34dca2c09c20dcd030.

Rebased this patch again

Alexander, are you planning on working on this patch?

Otherwise, Anna, would you want to "own" the patch?


And rephrased Anna's two independent/alternate patches as a 2nd patch on top of
the 1st, which makes it easier to read and reduces its total size.

I noticed while testing the patch that autovacuum still hits the relation
shortly after VACUUM FULL. This is due to n_ins_since_vacuum, which is new
in pg13. I don't know how to address that (or even whether it should be
addressed at all).

Also, pg_class.relallvisible is not set unless vacuum/analyze is run again
(which is mitigated by the n_ins behavior above). It seems like this might be
important: a plan which uses an index-only scan might be missed in favour of
something else until autovacuum runs (it will use cost-based delay, might not
be scheduled immediately, could be interrupted, or even be disabled).

I'm testing like this:
CREATE EXTENSION IF NOT EXISTS pg_visibility;
DROP TABLE t;
CREATE TABLE t AS SELECT generate_series(1,99999);
VACUUM FULL t;
ANALYZE t;
SELECT n_ins_since_vacuum, n_tup_upd, n_tup_del, n_tup_hot_upd
  FROM pg_stat_all_tables WHERE relname='t';
SELECT relpages, relallvisible FROM pg_class WHERE oid='t'::regclass;
SELECT COUNT(1),
       COUNT(1) FILTER (WHERE all_visible='t') allvis,
       COUNT(1) FILTER (WHERE all_frozen='t') allfrz,
       COUNT(1) FILTER (WHERE pd_all_visible='t') allvispd
  FROM pg_visibility('t');

Attachments:

v2-0001-VACUUM-FULL-CLUSTER-set-VM-and-page-level-visibil.patch (text/x-diff; charset=us-ascii)
From eb04b296c95a20c2f1b8ac670de91fd93e3d7a33 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 15:57:09 -0500
Subject: [PATCH v2 1/3] VACUUM FULL/CLUSTER: set VM and page-level visibility
 bits

From: 	Alexander Korotkov
write-vm-during-cluster-3-rebase
---
 src/backend/access/heap/rewriteheap.c   | 131 +++++++++++++++++++++++-
 src/backend/access/heap/visibilitymap.c |  18 ----
 src/include/access/visibilitymap.h      |  18 ++++
 3 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 986a776bbd..af0d1b2d0c 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -110,6 +110,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
@@ -152,6 +153,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -213,6 +219,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -264,6 +273,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -284,6 +298,9 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	if (!smgrexists(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM))
+		smgrcreate(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -317,6 +334,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -326,10 +346,16 @@ end_heap_rewrite(RewriteState state)
 
 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
 
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, (char *) state->rs_buffer, true);
+		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, state->rs_blockno,
+				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
 	 * reason is the same as in storage.c's RelationCopyStorage(): we're
@@ -338,7 +364,11 @@ end_heap_rewrite(RewriteState state)
 	 * wrote before the checkpoint.
 	 */
 	if (RelationNeedsWAL(state->rs_new_rel))
+	{
+		/* for an empty table, this could be first smgr access */
 		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM);
+	}
 
 	logical_end_heap_rewrite(state);
 
@@ -346,6 +376,97 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (RelationNeedsWAL(state->rs_new_rel))
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -472,6 +593,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -677,6 +799,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * enforce saveFreeSpace unconditionally.
 			 */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -695,6 +820,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd30..dd0e686a9f 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -99,24 +99,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 0981b218ea..c902900d89 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -26,6 +26,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
-- 
2.17.0

v2-0002-refactor-the-code-to-make-sure-that-logic-is-not-.patch (text/x-diff; charset=us-ascii)
From 37b1af07998a249eb6b254ae420a0440d67d3de4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:01:13 -0500
Subject: [PATCH v2 2/3] refactor the code to make sure that logic is not
 duplicated.

0001-write-vm-during-cluster-4.patch
Date: Mon, 28 Jun 2021 11:22:01 +0300
From: Anna Akenteva <a.akenteva@postgrespro.ru>
---
 src/backend/access/heap/heapam_handler.c | 19 +++---
 src/backend/access/heap/rewriteheap.c    | 12 +++-
 src/backend/access/heap/vacuumlazy.c     | 75 ++++++++++++++++--------
 src/backend/utils/sort/tuplesort.c       |  9 ++-
 src/include/access/heapam.h              |  3 +
 src/include/access/rewriteheap.h         |  4 +-
 src/include/utils/tuplesort.h            |  4 +-
 7 files changed, 87 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 9befe012a9..90361080d0 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -47,7 +47,8 @@
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
-									 Datum *values, bool *isnull, RewriteState rwstate);
+									 Datum *values, bool *isnull, bool islive,
+									 RewriteState rwstate);
 
 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   HeapTuple tuple,
@@ -782,6 +783,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		HeapTuple	tuple;
 		Buffer		buf;
 		bool		isdead;
+		bool		islive = false;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -848,6 +850,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			case HEAPTUPLE_LIVE:
 				/* Live or recently dead, must copy it */
 				isdead = false;
+				islive = true;
 				break;
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 
@@ -903,7 +906,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		*num_tuples += 1;
 		if (tuplesort != NULL)
 		{
-			tuplesort_putheaptuple(tuplesort, tuple);
+			tuplesort_putheaptuple(tuplesort, tuple, islive);
 
 			/*
 			 * In scan-and-sort mode, report increase in number of tuples
@@ -921,7 +924,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			int64		ct_val[2];
 
 			reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
-									 values, isnull, rwstate);
+									 values, isnull, islive, rwstate);
 
 			/*
 			 * In indexscan mode and also VACUUM FULL, report increase in
@@ -961,17 +964,18 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		for (;;)
 		{
 			HeapTuple	tuple;
+			bool		islive;
 
 			CHECK_FOR_INTERRUPTS();
 
-			tuple = tuplesort_getheaptuple(tuplesort, true);
+			tuple = tuplesort_getheaptuple(tuplesort, true, &islive);
 			if (tuple == NULL)
 				break;
 
 			n_tuples += 1;
 			reform_and_rewrite_tuple(tuple,
 									 OldHeap, NewHeap,
-									 values, isnull,
+									 values, isnull, islive,
 									 rwstate);
 			/* Report n_tuples */
 			pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
@@ -2458,7 +2462,8 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
 static void
 reform_and_rewrite_tuple(HeapTuple tuple,
 						 Relation OldHeap, Relation NewHeap,
-						 Datum *values, bool *isnull, RewriteState rwstate)
+						 Datum *values, bool *isnull, bool islive,
+						 RewriteState rwstate)
 {
 	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
 	TupleDesc	newTupDesc = RelationGetDescr(NewHeap);
@@ -2477,7 +2482,7 @@ reform_and_rewrite_tuple(HeapTuple tuple,
 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
 
 	/* The heap rewrite module does the rest */
-	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+	rewrite_heap_tuple(rwstate, islive, tuple, copiedTuple);
 
 	heap_freetuple(copiedTuple);
 }
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index af0d1b2d0c..623149ac26 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -424,6 +424,8 @@ rewrite_set_vm_flags(RewriteState state)
 	map[mapByte] |= (flags << mapOffset);
 }
 
+// rewrite_update_vm_flags(state, new_tuple);
+
 /*
  * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
  * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
@@ -475,11 +477,12 @@ rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
  * it had better be temp storage not a pointer to the original tuple.
  *
  * state		opaque state as returned by begin_heap_rewrite
+ * is_live		whether the currently processed tuple is live
  * old_tuple	original tuple in the old heap
  * new_tuple	new, rewritten tuple to be inserted to new heap
  */
 void
-rewrite_heap_tuple(RewriteState state,
+rewrite_heap_tuple(RewriteState state, bool is_live,
 				   HeapTuple old_tuple, HeapTuple new_tuple)
 {
 	MemoryContext old_cxt;
@@ -593,7 +596,12 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
-		rewrite_update_vm_flags(state, new_tuple);
+
+		heap_page_update_vm_flags(new_tuple, state->rs_oldest_xmin,
+								 is_live, true,
+								 &(state->rs_all_visible),
+								 &(state->rs_all_frozen));
+
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 558cc88a08..25ac3e7b51 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3605,6 +3605,53 @@ vac_cmp_itemptr(const void *left, const void *right)
 	return 0;
 }
 
+/*
+ * Update all_visible and all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+void
+heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+						  bool is_live, bool check_infomask,
+						  bool *all_visible, bool *all_frozen)
+{
+	TransactionId xmin;
+
+	/* Check comments in lazy_scan_prune. */
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data) || !is_live)
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/*
+	 * The inserter definitely committed. But is it old enough
+	 * that everyone sees it as committed?
+	 */
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, OldestXmin))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	if (check_infomask &&
+		!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/* Check whether this tuple is already frozen or not */
+	if (*all_visible && *all_frozen &&
+		heap_tuple_needs_eventual_freeze(tuple->t_data))
+		*all_frozen = false;
+}
+
 /*
  * Check if every tuple in the given page is visible to all current and future
  * transactions. Also return the visibility_cutoff_xid which is the highest
@@ -3673,34 +3720,14 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 				{
 					TransactionId xmin;
 
-					/* Check comments in lazy_scan_prune. */
-					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
-
-					/*
-					 * The inserter definitely committed. But is it old enough
-					 * that everyone sees it as committed?
-					 */
-					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
-					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
+					heap_page_update_vm_flags(&tuple, vacrel->OldestXmin,
+											  true, false,
+											  &all_visible, all_frozen);
 
 					/* Track newest xmin on page. */
+					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
 						*visibility_cutoff_xid = xmin;
-
-					/* Check whether this tuple is already frozen or not */
-					if (all_visible && *all_frozen &&
-						heap_tuple_needs_eventual_freeze(tuple.t_data))
-						*all_frozen = false;
 				}
 				break;
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 90e26745df..97b783e4a1 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -184,6 +184,7 @@ typedef struct
 	Datum		datum1;			/* value of first key column */
 	bool		isnull1;		/* is first key column NULL? */
 	int			srctape;		/* source tape number */
+	bool		is_live;		/* is the tuple alive? */
 } SortTuple;
 
 /*
@@ -1706,7 +1707,7 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
  * Note that the input data is always copied; the caller need not save it.
  */
 void
-tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
+tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -1718,6 +1719,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 	COPYTUP(state, &stup, (void *) tup);
 
 	puttuple_common(state, &stup);
+	stup.is_live = is_live;
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -2442,7 +2444,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
  * remaining valid after any further manipulation of tuplesort.
  */
 HeapTuple
-tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
+tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -2452,6 +2454,9 @@ tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
 
 	MemoryContextSwitchTo(oldcontext);
 
+	if (is_live)
+		*is_live = stup.is_live;
+
 	return stup.tuple;
 }
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e63b49fc38..71bda855f2 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -199,6 +199,9 @@ struct VacuumParams;
 extern void heap_vacuum_rel(Relation rel,
 							struct VacuumParams *params, BufferAccessStrategy bstrategy);
 extern void parallel_vacuum_main(dsm_segment *seg, shm_toc *toc);
+extern void heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+									  bool is_live, bool check_infomask,
+									  bool *all_visible, bool *all_frozen);
 
 /* in heap/heapam_visibility.c */
 extern bool HeapTupleSatisfiesVisibility(HeapTuple stup, Snapshot snapshot,
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 121f552405..2cda7320d3 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -25,8 +25,8 @@ extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 									   TransactionId OldestXmin, TransactionId FreezeXid,
 									   MultiXactId MultiXactCutoff);
 extern void end_heap_rewrite(RewriteState state);
-extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
-							   HeapTuple newTuple);
+extern void rewrite_heap_tuple(RewriteState state, bool is_live,
+							   HeapTuple oldTuple, HeapTuple newTuple);
 extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
 
 /*
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index f94949370b..45442c4351 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -232,7 +232,7 @@ extern bool tuplesort_used_bound(Tuplesortstate *state);
 
 extern void tuplesort_puttupleslot(Tuplesortstate *state,
 								   TupleTableSlot *slot);
-extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
+extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live);
 extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
 										  Relation rel, ItemPointer self,
 										  Datum *values, bool *isnull);
@@ -243,7 +243,7 @@ extern void tuplesort_performsort(Tuplesortstate *state);
 
 extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
 								   bool copy, TupleTableSlot *slot, Datum *abbrev);
-extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
+extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live);
 extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
 extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
 							   Datum *val, bool *isNull, Datum *abbrev);
-- 
2.17.0

v2-0003-cluster-set-relallvisible-num_pages.patch (text/x-diff; charset=us-ascii)
From de6586495575dd550e74a760f46060c0d51ea1d4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:55:38 -0500
Subject: [PATCH v2 3/3] cluster: set relallvisible=num_pages

---
 src/backend/commands/cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 83e968cfa4..86f772c107 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1016,6 +1016,7 @@ copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	relform = (Form_pg_class) GETSTRUCT(reltup);
 
 	relform->relpages = num_pages;
+	relform->relallvisible = num_pages;
 	relform->reltuples = num_tuples;
 
 	/* Don't update the stats for pg_class.  See swap_relation_files. */
-- 
2.17.0

#13Justin Pryzby
pryzby@telsasoft.com
In reply to: Justin Pryzby (#12)
3 attachment(s)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Mon, Nov 15, 2021 at 04:38:56PM -0600, Justin Pryzby wrote:

On Sun, Aug 29, 2021 at 07:26:42PM -0500, Justin Pryzby wrote:

On Mon, Jun 28, 2021 at 11:22:01AM +0300, Anna Akenteva wrote:

On 2019-11-29 05:32, Michael Paquier wrote:

These comments have gone unanswered for more than two months, so I am marking
this entry as returned with feedback.

I'd like to revive this patch. To make further work easier, I'm attaching a
rebased version of Alexander's latest patch.

As for having yet another copy of the logic that determines visibility: the
visibility check was mainly taken from heap_page_is_all_visible(), so I
refactored the code to make sure that logic is not duplicated. The updated
patch is attached as well.

I agree that it's strange that VACUUM(FREEZE) freezes tuples but sets neither
the VM bits nor the page-level PD_ALL_VISIBLE flag (which is implied by
all-frozen). After VACUUM FULL runs (taking an exclusive lock on the table),
the table then has to be vacuumed again to get the intended result.
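
For illustration, here's a minimal psql sketch of that pre-patch behaviour
(the table name "t" and the row count are arbitrary; the relallvisible values
in the comments are what an unpatched server is expected to report):

    CREATE TABLE t AS SELECT g AS i FROM generate_series(1, 100000) g;
    VACUUM (FREEZE) t;
    SELECT relallvisible FROM pg_class WHERE oid = 't'::regclass;  -- > 0
    VACUUM FULL t;
    SELECT relallvisible FROM pg_class WHERE oid = 't'::regclass;  -- back to 0
    VACUUM t;  -- a second pass, only to rebuild the visibility map
    SELECT relallvisible FROM pg_class WHERE oid = 't'::regclass;  -- > 0 again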

Rebased on f10f0ae420ee62400876ab34dca2c09c20dcd030.

Rebased this patch again

Alexander, are you planning on working on this patch ?

Otherwise, Anna, would you want to "own" the patch ?

Rebased on 8e1fae193864527c931a704bd7908e4fbc983f5c.

Would someone step up to "own" this patch ?

If not, its CF entry may need to be closed (there's no status for "needs
author").

Attachments:

v3-0001-VACUUM-FULL-CLUSTER-set-VM-and-page-level-visibil.patch (text/x-diff; charset=us-ascii)
From 5e9534b6c6c0676de700ca7c2e06b53912a43874 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 15:57:09 -0500
Subject: [PATCH v3 1/3] VACUUM FULL/CLUSTER: set VM and page-level visibility
 bits

From: 	Alexander Korotkov
write-vm-during-cluster-3-rebase
---
 src/backend/access/heap/rewriteheap.c   | 131 +++++++++++++++++++++++-
 src/backend/access/heap/visibilitymap.c |  18 ----
 src/include/access/visibilitymap.h      |  18 ++++
 3 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 986a776bbd5..af0d1b2d0c8 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -110,6 +110,7 @@
 #include "access/heaptoast.h"
 #include "access/rewriteheap.h"
 #include "access/transam.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "access/xloginsert.h"
 #include "catalog/catalog.h"
@@ -152,6 +153,11 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	Page		rs_vm_buffer;	/* visibility map page */
+	BlockNumber rs_vm_blockno;	/* block number of visibility page */
+	BlockNumber rs_vm_buffer_valid;	/* T if any bits are set */
+	bool		rs_all_visible;	/* all visible flag for rs_buffer */
+	bool		rs_all_frozen;	/* all frozen flag for rs_buffer */
 }			RewriteStateData;
 
 /*
@@ -213,6 +219,9 @@ typedef struct RewriteMappingDataEntry
 
 
 /* prototypes for internal functions */
+static void rewrite_flush_vm_page(RewriteState state);
+static void rewrite_set_vm_flags(RewriteState state);
+static void rewrite_update_vm_flags(RewriteState state, HeapTuple tuple);
 static void raw_heap_insert(RewriteState state, HeapTuple tup);
 
 /* internal logical remapping prototypes */
@@ -264,6 +273,11 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_freeze_xid = freeze_xid;
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
+	state->rs_vm_buffer = (Page) palloc(BLCKSZ);
+	state->rs_vm_blockno = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	state->rs_vm_buffer_valid = false;
+	state->rs_all_visible = true;
+	state->rs_all_frozen = true;
 
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
@@ -284,6 +298,9 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 					&hash_ctl,
 					HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
 
+	if (!smgrexists(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM))
+		smgrcreate(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM, false);
+
 	MemoryContextSwitchTo(old_cxt);
 
 	logical_begin_heap_rewrite(state);
@@ -317,6 +334,9 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
+		if (state->rs_all_visible)
+			PageSetAllVisible(state->rs_buffer);
+
 		if (RelationNeedsWAL(state->rs_new_rel))
 			log_newpage(&state->rs_new_rel->rd_node,
 						MAIN_FORKNUM,
@@ -326,10 +346,16 @@ end_heap_rewrite(RewriteState state)
 
 		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
 
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, (char *) state->rs_buffer, true);
+		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM, state->rs_blockno,
+				   (char *) state->rs_buffer, true);
+
+		rewrite_set_vm_flags(state);
 	}
 
+	/* Write the last VM page too */
+	if (state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
 	/*
 	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
 	 * reason is the same as in storage.c's RelationCopyStorage(): we're
@@ -338,7 +364,11 @@ end_heap_rewrite(RewriteState state)
 	 * wrote before the checkpoint.
 	 */
 	if (RelationNeedsWAL(state->rs_new_rel))
+	{
+		/* for an empty table, this could be first smgr access */
 		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM);
+	}
 
 	logical_end_heap_rewrite(state);
 
@@ -346,6 +376,97 @@ end_heap_rewrite(RewriteState state)
 	MemoryContextDelete(state->rs_cxt);
 }
 
+/* Write contents of VM page */
+static void
+rewrite_flush_vm_page(RewriteState state)
+{
+	Assert(state->rs_vm_buffer_valid);
+
+	if (RelationNeedsWAL(state->rs_new_rel))
+		log_newpage(&state->rs_new_rel->rd_node,
+					VISIBILITYMAP_FORKNUM,
+					state->rs_vm_blockno,
+					state->rs_vm_buffer,
+					true);
+
+	PageSetChecksumInplace(state->rs_vm_buffer, state->rs_vm_blockno);
+
+	smgrextend(RelationGetSmgr(state->rs_new_rel), VISIBILITYMAP_FORKNUM,
+			   state->rs_vm_blockno, (char *) state->rs_vm_buffer, true);
+
+	state->rs_vm_buffer_valid = false;
+}
+
+/* Set VM flags to the VM page */
+static void
+rewrite_set_vm_flags(RewriteState state)
+{
+	BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(state->rs_blockno);
+	uint32		mapByte = HEAPBLK_TO_MAPBYTE(state->rs_blockno);
+	uint8		mapOffset = HEAPBLK_TO_OFFSET(state->rs_blockno);
+	char	   *map;
+	uint8		flags;
+
+	if (mapBlock != state->rs_vm_blockno && state->rs_vm_buffer_valid)
+		rewrite_flush_vm_page(state);
+
+	if (!state->rs_vm_buffer_valid)
+	{
+		PageInit(state->rs_vm_buffer, BLCKSZ, 0);
+		state->rs_vm_blockno = mapBlock;
+		state->rs_vm_buffer_valid = true;
+	}
+
+	flags = (state->rs_all_visible ? VISIBILITYMAP_ALL_VISIBLE : 0) |
+			(state->rs_all_frozen ? VISIBILITYMAP_ALL_FROZEN : 0);
+
+	map = PageGetContents(state->rs_vm_buffer);
+	map[mapByte] |= (flags << mapOffset);
+}
+
+/*
+ * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+static void
+rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
+{
+	TransactionId	xmin;
+
+	if (!state->rs_all_visible)
+		return;
+
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, state->rs_oldest_xmin))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		state->rs_all_visible = false;
+		state->rs_all_frozen = false;
+		return;
+	}
+
+	if (!state->rs_all_frozen)
+		return;
+
+	if (heap_tuple_needs_eventual_freeze(tuple->t_data))
+		state->rs_all_frozen = false;
+}
+
 /*
  * Add a tuple to the new heap.
  *
@@ -472,6 +593,7 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
+		rewrite_update_vm_flags(state, new_tuple);
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
@@ -677,6 +799,9 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * enforce saveFreeSpace unconditionally.
 			 */
 
+			if (state->rs_all_visible)
+				PageSetAllVisible(page);
+
 			/* XLOG stuff */
 			if (RelationNeedsWAL(state->rs_new_rel))
 				log_newpage(&state->rs_new_rel->rd_node,
@@ -695,6 +820,8 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
 					   state->rs_blockno, (char *) page, true);
 
+			rewrite_set_vm_flags(state);
+
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
 		}
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 114fbbdd307..dd0e686a9fc 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -99,24 +99,6 @@
 
 /*#define TRACE_VISIBILITYMAP */
 
-/*
- * Size of the bitmap on each visibility map page, in bytes. There's no
- * extra headers, so the whole page minus the standard page header is
- * used for the bitmap.
- */
-#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
-
-/* Number of heap blocks we can represent in one byte */
-#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
-
-/* Number of heap blocks we can represent in one visibility map page. */
-#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
-
-/* Mapping from heap block number to the right bit in the visibility map */
-#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
-#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
-#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
-
 /* Masks for counting subsets of bits in the visibility map. */
 #define VISIBLE_MASK64	UINT64CONST(0x5555555555555555) /* The lower bit of each
 														 * bit pair */
diff --git a/src/include/access/visibilitymap.h b/src/include/access/visibilitymap.h
index 0981b218ea0..c902900d895 100644
--- a/src/include/access/visibilitymap.h
+++ b/src/include/access/visibilitymap.h
@@ -26,6 +26,24 @@
 #define VM_ALL_FROZEN(r, b, v) \
 	((visibilitymap_get_status((r), (b), (v)) & VISIBILITYMAP_ALL_FROZEN) != 0)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes. There's no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 extern bool visibilitymap_clear(Relation rel, BlockNumber heapBlk,
 								Buffer vmbuf, uint8 flags);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,
-- 
2.17.1

v3-0002-refactor-the-code-to-make-sure-that-logic-is-not-.patch (text/x-diff; charset=us-ascii)
From 1112e333a32f9347f38f75e2750cb132799f63d4 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:01:13 -0500
Subject: [PATCH v3 2/3] refactor the code to make sure that logic is not
 duplicated.

0001-write-vm-during-cluster-4.patch
Date: Mon, 28 Jun 2021 11:22:01 +0300
From: Anna Akenteva <a.akenteva@postgrespro.ru>
---
 src/backend/access/heap/heapam_handler.c | 19 +++---
 src/backend/access/heap/rewriteheap.c    | 12 +++-
 src/backend/access/heap/vacuumlazy.c     | 75 ++++++++++++++++--------
 src/backend/utils/sort/tuplesort.c       |  9 ++-
 src/include/access/heapam.h              |  3 +
 src/include/access/rewriteheap.h         |  4 +-
 src/include/utils/tuplesort.h            |  4 +-
 7 files changed, 87 insertions(+), 39 deletions(-)

diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 9befe012a9e..90361080d02 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -47,7 +47,8 @@
 
 static void reform_and_rewrite_tuple(HeapTuple tuple,
 									 Relation OldHeap, Relation NewHeap,
-									 Datum *values, bool *isnull, RewriteState rwstate);
+									 Datum *values, bool *isnull, bool islive,
+									 RewriteState rwstate);
 
 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
 								   HeapTuple tuple,
@@ -782,6 +783,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		HeapTuple	tuple;
 		Buffer		buf;
 		bool		isdead;
+		bool		islive = false;
 
 		CHECK_FOR_INTERRUPTS();
 
@@ -848,6 +850,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			case HEAPTUPLE_LIVE:
 				/* Live or recently dead, must copy it */
 				isdead = false;
+				islive = true;
 				break;
 			case HEAPTUPLE_INSERT_IN_PROGRESS:
 
@@ -903,7 +906,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		*num_tuples += 1;
 		if (tuplesort != NULL)
 		{
-			tuplesort_putheaptuple(tuplesort, tuple);
+			tuplesort_putheaptuple(tuplesort, tuple, islive);
 
 			/*
 			 * In scan-and-sort mode, report increase in number of tuples
@@ -921,7 +924,7 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 			int64		ct_val[2];
 
 			reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
-									 values, isnull, rwstate);
+									 values, isnull, islive, rwstate);
 
 			/*
 			 * In indexscan mode and also VACUUM FULL, report increase in
@@ -961,17 +964,18 @@ heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
 		for (;;)
 		{
 			HeapTuple	tuple;
+			bool		islive;
 
 			CHECK_FOR_INTERRUPTS();
 
-			tuple = tuplesort_getheaptuple(tuplesort, true);
+			tuple = tuplesort_getheaptuple(tuplesort, true, &islive);
 			if (tuple == NULL)
 				break;
 
 			n_tuples += 1;
 			reform_and_rewrite_tuple(tuple,
 									 OldHeap, NewHeap,
-									 values, isnull,
+									 values, isnull, islive,
 									 rwstate);
 			/* Report n_tuples */
 			pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
@@ -2458,7 +2462,8 @@ heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
 static void
 reform_and_rewrite_tuple(HeapTuple tuple,
 						 Relation OldHeap, Relation NewHeap,
-						 Datum *values, bool *isnull, RewriteState rwstate)
+						 Datum *values, bool *isnull, bool islive,
+						 RewriteState rwstate)
 {
 	TupleDesc	oldTupDesc = RelationGetDescr(OldHeap);
 	TupleDesc	newTupDesc = RelationGetDescr(NewHeap);
@@ -2477,7 +2482,7 @@ reform_and_rewrite_tuple(HeapTuple tuple,
 	copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
 
 	/* The heap rewrite module does the rest */
-	rewrite_heap_tuple(rwstate, tuple, copiedTuple);
+	rewrite_heap_tuple(rwstate, islive, tuple, copiedTuple);
 
 	heap_freetuple(copiedTuple);
 }
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index af0d1b2d0c8..623149ac262 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -424,6 +424,8 @@ rewrite_set_vm_flags(RewriteState state)
 	map[mapByte] |= (flags << mapOffset);
 }
 
+// rewrite_update_vm_flags(state, new_tuple);
+
 /*
  * Update rs_all_visible and rs_all_frozen flags according to the tuple.  We
  * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
@@ -475,11 +477,12 @@ rewrite_update_vm_flags(RewriteState state, HeapTuple tuple)
  * it had better be temp storage not a pointer to the original tuple.
  *
  * state		opaque state as returned by begin_heap_rewrite
+ * is_live		whether the currently processed tuple is live
  * old_tuple	original tuple in the old heap
  * new_tuple	new, rewritten tuple to be inserted to new heap
  */
 void
-rewrite_heap_tuple(RewriteState state,
+rewrite_heap_tuple(RewriteState state, bool is_live,
 				   HeapTuple old_tuple, HeapTuple new_tuple)
 {
 	MemoryContext old_cxt;
@@ -593,7 +596,12 @@ rewrite_heap_tuple(RewriteState state,
 
 		/* Insert the tuple and find out where it's put in new_heap */
 		raw_heap_insert(state, new_tuple);
-		rewrite_update_vm_flags(state, new_tuple);
+
+		heap_page_update_vm_flags(new_tuple, state->rs_oldest_xmin,
+								 is_live, true,
+								 &(state->rs_all_visible),
+								 &(state->rs_all_frozen));
+
 		new_tid = new_tuple->t_self;
 
 		logical_rewrite_heap_tuple(state, old_tid, new_tuple);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index cd603e6aa41..f717a9557a7 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -3015,6 +3015,53 @@ dead_items_cleanup(LVRelState *vacrel)
 	vacrel->pvs = NULL;
 }
 
+/*
+ * Update all_visible and all_frozen flags according to the tuple.  We
+ * use simplified check assuming that HeapTupleSatisfiesVacuum() should already
+ * set tuple hint bits.
+ */
+void
+heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+						  bool is_live, bool check_infomask,
+						  bool *all_visible, bool *all_frozen)
+{
+	TransactionId xmin;
+
+	/* Check comments in lazy_scan_prune. */
+	if (!HeapTupleHeaderXminCommitted(tuple->t_data) || !is_live)
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/*
+	 * The inserter definitely committed. But is it old enough
+	 * that everyone sees it as committed?
+	 */
+	xmin = HeapTupleHeaderGetXmin(tuple->t_data);
+	if (!TransactionIdPrecedes(xmin, OldestXmin))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	if (check_infomask &&
+		!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID) &&
+		!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask))
+	{
+		*all_visible = false;
+		*all_frozen = false;
+		return;
+	}
+
+	/* Check whether this tuple is already frozen or not */
+	if (*all_visible && *all_frozen &&
+		heap_tuple_needs_eventual_freeze(tuple->t_data))
+		*all_frozen = false;
+}
+
 /*
  * Check if every tuple in the given page is visible to all current and future
  * transactions. Also return the visibility_cutoff_xid which is the highest
@@ -3084,34 +3131,14 @@ heap_page_is_all_visible(LVRelState *vacrel, Buffer buf,
 				{
 					TransactionId xmin;
 
-					/* Check comments in lazy_scan_prune. */
-					if (!HeapTupleHeaderXminCommitted(tuple.t_data))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
-
-					/*
-					 * The inserter definitely committed. But is it old enough
-					 * that everyone sees it as committed?
-					 */
-					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
-					if (!TransactionIdPrecedes(xmin, vacrel->OldestXmin))
-					{
-						all_visible = false;
-						*all_frozen = false;
-						break;
-					}
+					heap_page_update_vm_flags(&tuple, vacrel->OldestXmin,
+											  true, false,
+											  &all_visible, all_frozen);
 
 					/* Track newest xmin on page. */
+					xmin = HeapTupleHeaderGetXmin(tuple.t_data);
 					if (TransactionIdFollows(xmin, *visibility_cutoff_xid))
 						*visibility_cutoff_xid = xmin;
-
-					/* Check whether this tuple is already frozen or not */
-					if (all_visible && *all_frozen &&
-						heap_tuple_needs_eventual_freeze(tuple.t_data))
-						*all_frozen = false;
 				}
 				break;
 
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 90e26745dff..97b783e4a14 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -184,6 +184,7 @@ typedef struct
 	Datum		datum1;			/* value of first key column */
 	bool		isnull1;		/* is first key column NULL? */
 	int			srctape;		/* source tape number */
+	bool		is_live;		/* is the tuple alive? */
 } SortTuple;
 
 /*
@@ -1706,7 +1707,7 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot)
  * Note that the input data is always copied; the caller need not save it.
  */
 void
-tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
+tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -1718,6 +1719,7 @@ tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup)
 	COPYTUP(state, &stup, (void *) tup);
 
 	puttuple_common(state, &stup);
+	stup.is_live = is_live;
 
 	MemoryContextSwitchTo(oldcontext);
 }
@@ -2442,7 +2444,7 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, bool copy,
  * remaining valid after any further manipulation of tuplesort.
  */
 HeapTuple
-tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
+tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live)
 {
 	MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext);
 	SortTuple	stup;
@@ -2452,6 +2454,9 @@ tuplesort_getheaptuple(Tuplesortstate *state, bool forward)
 
 	MemoryContextSwitchTo(oldcontext);
 
+	if (is_live)
+		*is_live = stup.is_live;
+
 	return stup.tuple;
 }
 
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index f3fb1e93a59..4ecc92d1aee 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -198,6 +198,9 @@ extern void heap_get_root_tuples(Page page, OffsetNumber *root_offsets);
 struct VacuumParams;
 extern void heap_vacuum_rel(Relation rel,
 							struct VacuumParams *params, BufferAccessStrategy bstrategy);
+extern void heap_page_update_vm_flags(HeapTuple tuple, TransactionId OldestXmin,
+									  bool is_live, bool check_infomask,
+									  bool *all_visible, bool *all_frozen);
 
 /* in heap/heapam_visibility.c */
 extern bool HeapTupleSatisfiesVisibility(HeapTuple stup, Snapshot snapshot,
diff --git a/src/include/access/rewriteheap.h b/src/include/access/rewriteheap.h
index 121f5524051..2cda7320d36 100644
--- a/src/include/access/rewriteheap.h
+++ b/src/include/access/rewriteheap.h
@@ -25,8 +25,8 @@ extern RewriteState begin_heap_rewrite(Relation OldHeap, Relation NewHeap,
 									   TransactionId OldestXmin, TransactionId FreezeXid,
 									   MultiXactId MultiXactCutoff);
 extern void end_heap_rewrite(RewriteState state);
-extern void rewrite_heap_tuple(RewriteState state, HeapTuple oldTuple,
-							   HeapTuple newTuple);
+extern void rewrite_heap_tuple(RewriteState state, bool is_live,
+							   HeapTuple oldTuple, HeapTuple newTuple);
 extern bool rewrite_heap_dead_tuple(RewriteState state, HeapTuple oldTuple);
 
 /*
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index f94949370b4..45442c43515 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -232,7 +232,7 @@ extern bool tuplesort_used_bound(Tuplesortstate *state);
 
 extern void tuplesort_puttupleslot(Tuplesortstate *state,
 								   TupleTableSlot *slot);
-extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup);
+extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup, bool is_live);
 extern void tuplesort_putindextuplevalues(Tuplesortstate *state,
 										  Relation rel, ItemPointer self,
 										  Datum *values, bool *isnull);
@@ -243,7 +243,7 @@ extern void tuplesort_performsort(Tuplesortstate *state);
 
 extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward,
 								   bool copy, TupleTableSlot *slot, Datum *abbrev);
-extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward);
+extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *is_live);
 extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward);
 extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward,
 							   Datum *val, bool *isNull, Datum *abbrev);
-- 
2.17.1

v3-0003-cluster-set-relallvisible-num_pages.patch (text/x-diff; charset=us-ascii)
From 098d3ff6c73c7aa9f798203f19f434eb88d92d94 Mon Sep 17 00:00:00 2001
From: Justin Pryzby <pryzbyj@telsasoft.com>
Date: Sun, 29 Aug 2021 16:55:38 -0500
Subject: [PATCH v3 3/3] cluster: set relallvisible=num_pages

---
 src/backend/commands/cluster.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c
index 51ef2f2953e..e19121915f2 100644
--- a/src/backend/commands/cluster.c
+++ b/src/backend/commands/cluster.c
@@ -1021,6 +1021,7 @@ copy_table_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, bool verbose,
 	relform = (Form_pg_class) GETSTRUCT(reltup);
 
 	relform->relpages = num_pages;
+	relform->relallvisible = num_pages;
 	relform->reltuples = num_tuples;
 
 	/* Don't update the stats for pg_class.  See swap_relation_files. */
-- 
2.17.1

#14Justin Pryzby
pryzby@telsasoft.com
In reply to: Justin Pryzby (#13)
Re: Write visibility map during CLUSTER/VACUUM FULL

On Sun, Dec 26, 2021 at 08:59:31PM -0600, Justin Pryzby wrote:

Rebased on 8e1fae193864527c931a704bd7908e4fbc983f5c.

Would someone step up to "own" this patch ?

If not, its CF entry may need to be closed (there's no status for "needs
author").

I'm planning to close this patch until someone can shepherd it.

--
Justin

#15Jacob Champion
jchampion@timescale.com
In reply to: Justin Pryzby (#14)
Re: Write visibility map during CLUSTER/VACUUM FULL

Justin Pryzby <pryzby@telsasoft.com> wrote:

I'm planning to close this patch until someone can shepherd it.

I've marked it RwF (Returned with Feedback) for now.

--Jacob