From 85d251c614bb8ea7692bb1f403a982a2c6c007da Mon Sep 17 00:00:00 2001
From: Bruce Momjian <bruce@momjian.us>
Date: Tue, 6 Apr 2021 14:23:46 -0400
Subject: [PATCH] cfe-10-hint_over_cfe-09-test squash commit

---
 doc/src/sgml/config.sgml                 |   7 +-
 src/backend/access/rmgrdesc/xlogdesc.c   |   6 +-
 src/backend/access/transam/xlog.c        |   4 +
 src/backend/access/transam/xloginsert.c  |  24 +++++
 src/backend/replication/logical/decode.c |   1 +
 src/backend/storage/buffer/bufmgr.c      | 128 +++++++++++++++++------
 src/include/access/xlog.h                |  14 +--
 src/include/access/xloginsert.h          |   2 +
 src/include/catalog/pg_control.h         |   1 +
 9 files changed, 143 insertions(+), 44 deletions(-)
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d7b6e368d0..6bcd9c64df 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -3093,9 +3093,10 @@ include_dir 'conf.d'
        </para>
 
        <para>
-        If data checksums are enabled, hint bit updates are always WAL-logged
-        and this setting is ignored. You can use this setting to test how much
-        extra WAL-logging would occur if your database had data checksums
+        If data checksums or cluster file encryption is enabled,
+        hint bit updates are always WAL-logged and this setting is
+        ignored. You can use this setting to test how much extra
+        WAL-logging would occur if your database had data checksums
         enabled.
        </para>
 
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index e6090a9dad..d5474990d6 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -80,7 +80,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record)
 
 		appendStringInfoString(buf, xlrec->rp_name);
 	}
-	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
+	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
+			 info == XLOG_ENCRYPTION_LSN)
 	{
 		/* no further information to print */
 	}
@@ -184,6 +185,9 @@ xlog_identify(uint8 info)
 		case XLOG_FPI_FOR_HINT:
 			id = "FPI_FOR_HINT";
 			break;
+		case XLOG_ENCRYPTION_LSN:
+			id = "ENCRYPTION_LSN";
+			break;
 	}
 
 	return id;
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 51165f6593..dba690bc5e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -10353,6 +10353,10 @@ xlog_redo(XLogReaderState *record)
 			UnlockReleaseBuffer(buffer);
 		}
 	}
+	else if (info == XLOG_ENCRYPTION_LSN)
+	{
+		/* nothing to do here */
+	}
 	else if (info == XLOG_BACKUP_END)
 	{
 		XLogRecPtr	startpoint;
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 7052dc245e..c76ed1f881 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -980,6 +980,30 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
 	return recptr;
 }
 
+/*
+ * This function returns either a WAL or fake LSN, for use by encryption.
+ */
+XLogRecPtr
+LSNForEncryption(bool use_wal_lsn)
+{
+	if (use_wal_lsn)
+	{
+		int			dummy = 0;
+
+		Assert(FileEncryptionEnabled);
+		/*
+		 * Records other than SWITCH_WAL must have content. We use an integer 0 to
+		 * follow the restriction.
+		 */
+		XLogBeginInsert();
+		XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+		XLogRegisterData((char *) &dummy, sizeof(dummy));
+		return XLogInsert(RM_XLOG_ID, XLOG_ENCRYPTION_LSN);
+	}
+	else
+		return GetFakeLSNForUnloggedRel();
+}
+
 /*
  * Write a WAL record containing a full image of a page. Caller is responsible
  * for writing the page to disk after calling this routine.
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 97be4b0f23..48400fde2a 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -224,6 +224,7 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
 		case XLOG_FPW_CHANGE:
 		case XLOG_FPI_FOR_HINT:
 		case XLOG_FPI:
+		case XLOG_ENCRYPTION_LSN:
 			break;
 		default:
 			elog(ERROR, "unexpected RM_XLOG_ID record type: %u", info);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 852138f9c9..367fed5e85 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3766,11 +3766,12 @@ IncrBufferRefCount(Buffer buffer)
  * This is essentially the same as MarkBufferDirty, except:
  *
  * 1. The caller does not write WAL; so if checksums are enabled, we may need
- *	  to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
+ *    to write an XLOG_FPI_FOR_HINT record to protect against torn pages, or
+ *    XLOG_ENCRYPTION_LSN to generate a new LSN for the page.
  * 2. The caller might have only share-lock instead of exclusive-lock on the
- *	  buffer's content lock.
+ *    buffer's content lock.
  * 3. This function does not guarantee that the buffer is always marked dirty
- *	  (due to a race condition), so it cannot be used for important changes.
+ *    (due to a race condition), so it cannot be used for important changes.
  */
 void
 MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
@@ -3816,51 +3817,110 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 		 * If we need to protect hint bit updates from torn writes, WAL-log a
 		 * full page image of the page. This full page image is only necessary
 		 * if the hint bit update is the first change to the page since the
-		 * last checkpoint.
+		 * last checkpoint.  If cluster file encryption is enabled, we also
+		 * need to generate new page LSNs for all other cases of page writes.
 		 *
 		 * We don't check full_page_writes here because that logic is included
 		 * when we call XLogInsert() since the value changes dynamically.
 		 */
-		if (XLogHintBitIsNeeded() &&
-			(pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
+		if (XLogHintBitIsNeeded())
 		{
 			/*
-			 * If we must not write WAL, due to a relfilenode-specific
-			 * condition or being in recovery, don't dirty the page.  We can
-			 * set the hint, just not dirty the page as a result so the hint
-			 * is lost when we evict the page or shutdown.
+			 * If we must not write WAL during recovery so don't dirty the page.
+			 * We can set the hint, just not dirty the page as a result so the
+			 * hint is lost when we evict the page or shutdown.
 			 *
 			 * See src/backend/storage/page/README for longer discussion.
+			 * XXX Can this be improved?
 			 */
-			if (RecoveryInProgress() ||
-				RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+			if	(RecoveryInProgress() ||
+				 (RelFileNodeSkippingWAL(bufHdr->tag.rnode) &&
+				  !FileEncryptionEnabled))
 				return;
 
 			/*
-			 * If the block is already dirty because we either made a change
-			 * or set a hint already, then we don't need to write a full page
-			 * image.  Note that aggressive cleaning of blocks dirtied by hint
-			 * bit setting would increase the call rate. Bulk setting of hint
-			 * bits would reduce the call rate...
-			 *
-			 * We must issue the WAL record before we mark the buffer dirty.
-			 * Otherwise we might write the page before we write the WAL. That
-			 * causes a race condition, since a checkpoint might occur between
-			 * writing the WAL record and marking the buffer dirty. We solve
-			 * that with a kluge, but one that is already in use during
-			 * transaction commit to prevent race conditions. Basically, we
-			 * simply prevent the checkpoint WAL record from being written
-			 * until we have marked the buffer dirty. We don't start the
-			 * checkpoint flush until we have marked dirty, so our checkpoint
-			 * must flush the change to disk successfully or the checkpoint
-			 * never gets written, so crash recovery will fix.
+			 * Non-BM_PERMANENT objects don't need full page images because
+			 * they are not restored.  WAL-skipped relfilenodes should never
+			 * have full page images generated.
+			 */
+			if (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT &&
+				!RelFileNodeSkippingWAL(bufHdr->tag.rnode))
+			{
+				/*
+				 * If the block is already dirty because we either made a change
+				 * or set a hint already, then we don't need to write a full
+				 * page image.  Note that aggressive cleaning of blocks dirtied
+				 * by hint bit setting would increase the call rate. Bulk
+				 * setting of hint bits would reduce the call rate...
+				 *
+				 * We must issue the WAL record before we mark the buffer
+				 * dirty.  Otherwise we might write the page before we write
+				 * the WAL. That causes a race condition, since a checkpoint
+				 * might occur between writing the WAL record and marking the
+				 * buffer dirty. We solve that with a kluge, but one that is
+				 * already in use during transaction commit to prevent race
+				 * conditions. Basically, we simply prevent the checkpoint WAL
+				 * record from being written until we have marked the buffer
+				 * dirty. We don't start the checkpoint flush until we have
+				 * marked dirty, so our checkpoint must flush the change to disk
+				 * successfully or the checkpoint never gets written, so crash
+				 * recovery will fix.
+				 *
+				 * It's possible we may enter here without an xid, so it
+				 * is essential that CreateCheckpoint waits for virtual
+				 * transactions rather than full transactionids.
+				 */
+				MyProc->delayChkpt = delayChkpt = true;
+				lsn = XLogSaveBufferForHint(buffer, buffer_std);
+			}
+
+			/*
+			 * Above, for hint bit changes, we might have generated a new page
+			 * LSN and a full-page WAL record for a page's first-clean-to-dirty
+			 * during a checkpoint for permanent, non-WAL-skipped relfilenodes.
+			 * If we didn't (the lsn variable is invalid), and we are
+			 * doing cluster file encryption, we must generate a new
+			 * page LSN here for either non-permanent relations or page
+			 * non-first-clean-to-dirty during a checkpoint.  (Cluster file
+			 * encryption does not support WAL-skip relfilenodes.)  We must
+			 * update the page LSN even if the page with the hint bit change is
+			 * later overwritten in the file system with an earlier version of
+			 * the page during crash recovery.
 			 *
-			 * It's possible we may enter here without an xid, so it is
-			 * essential that CreateCheckpoint waits for virtual transactions
-			 * rather than full transactionids.
+			 * XXX Can we rely on the full page write above with no lock being
+			 * held to avoid torn pages?  Above, the LSN and page image are
+			 * tied together, but here is just the page LSN update.
 			 */
-			MyProc->delayChkpt = delayChkpt = true;
-			lsn = XLogSaveBufferForHint(buffer, buffer_std);
+			if (XLogRecPtrIsInvalid(lsn) && FileEncryptionEnabled)
+			{
+				/*
+				 * For cluster file encryption we need a new page LSN because
+				 * the LSN is used, with the page number and permanent flag, as
+				 * part of the nonce, and the nonce must be unique for every
+				 * page write.  If we reencrypt a page with hint bit changes
+				 * using the same nonce as previous writes, it would expose the
+				 * hint bit change locations.  To avoid this, we write a simple
+				 * WAL record to advance the lsn, which can then be assigned to
+				 * the page below.
+				 *
+				 * Above we are relying on the full page writes to revert
+				 * any partial pages writes caused by this LSN change for
+				 * permanent, non-WAL-skip relfilenodes.  For non-permanent
+				 * relations, they crash recover as empty.  For WAL-skip
+				 * relfilenodes, they recover with their original contents, so
+				 * that works too.
+				 */
+				/* XXX Do we need the checkpoint delay here? */
+				MyProc->delayChkpt = delayChkpt = true;
+				/*
+				 * XXX We probably don't need to replay this WAL on the primary
+				 * since the full page image is restored, but do we have
+				 * to replay this on the repicas (for relations that are
+				 * replicated)?
+				 */
+				lsn = LSNForEncryption(
+						pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT);
+			}
 		}
 
 		buf_state = LockBufHdr(bufHdr);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ab66a660a5..52f5a42f50 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -203,13 +203,15 @@ extern PGDLLIMPORT int wal_level;
 /*
  * Is a full-page image needed for hint bit updates?
  *
- * Normally, we don't WAL-log hint bit updates, but if checksums are enabled,
- * we have to protect them against torn page writes.  When you only set
- * individual bits on a page, it's still consistent no matter what combination
- * of the bits make it to disk, but the checksum wouldn't match.  Also WAL-log
- * them if forced by wal_log_hints=on.
+ * Normally, we don't WAL-log hint bit updates, but if checksums or encryption
+ * is enabled, we have to protect them against torn page writes.  When you
+ * only set individual bits on a page, it's still consistent no matter what
+ * combination of the bits make it to disk, but the checksum wouldn't match.
+ * Cluster file encryption requires a new LSN for hint bit changes, and can't
+ * tolerate torn pages.  Also WAL-log them if forced by wal_log_hints=on.
  */
-#define XLogHintBitIsNeeded() (DataChecksumsEnabled() || wal_log_hints)
+#define XLogHintBitIsNeeded() \
+		(DataChecksumsEnabled() || FileEncryptionEnabled || wal_log_hints)
 
 /* Do we need to WAL-log information required only for Hot Standby and logical replication? */
 #define XLogStandbyInfoActive() (wal_level >= WAL_LEVEL_REPLICA)
diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h
index f1d8c39edf..e7ee674dd4 100644
--- a/src/include/access/xloginsert.h
+++ b/src/include/access/xloginsert.h
@@ -61,6 +61,8 @@ extern void log_newpage_range(Relation rel, ForkNumber forkNum,
 							  BlockNumber startblk, BlockNumber endblk, bool page_std);
 extern XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std);
 
+extern XLogRecPtr LSNForEncryption(bool use_wal_lsn);
+
 extern void InitXLogInsert(void);
 
 #endif							/* XLOGINSERT_H */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 6216d1faf5..e23693da4f 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -76,6 +76,7 @@ typedef struct CheckPoint
 #define XLOG_END_OF_RECOVERY			0x90
 #define XLOG_FPI_FOR_HINT				0xA0
 #define XLOG_FPI						0xB0
+#define XLOG_ENCRYPTION_LSN				0xC0
 
 
 /*
-- 
2.20.1