>From 4b36f4c0e1efecbd50918a2c854bfe6f26105201 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 30 Jan 2015 09:46:15 +0100
Subject: [PATCH 2/4] Fix startup process crash and race during
 CREATE/DROP/ALTER DATABASE.

When replaying database related WAL records the startup process
acquires lock on the database object to prevent new connections to the
affected database. That's necessary because we currently don't
replicate object locks from the primary.

Unfortunately that locking couldn't use the standby infrastructure for
managing locks and instead just used a normal
LockSharedObjectForSession(). While that works in the (common!)
uncontended situation, it doesn't if the lock is currently already
acquired. The standby process intentionally doesn't set up the
infrastructure to wait for locks in the normal, blocking, way. The
goal is to cancel other lock holder after max_standby_*_delay, which
is only possible if the sleeping is controlled. If we have to wait
anyway, we either crash with a assertion or a FATAL (promoted to
PANIC) error.

There's also another race, which was documented in comments. Namely
that not using the standby lock infrastructure requires explicitly
releasing the lock. That means there's a short phase where the catalog
and physical state of the database are inconsistent.

After the previous commit we can now use the standby infrastructure to
manage the lock. That requires assigning a transactionid to the
create/drop database transactions. To handle WAL generated by a older
point release we thus need to make the locking conditional on the
relevant records having a transactionid. On the primary WAL format is
bumped instead.

Additionally add previously missing locking for CREATE DATABASE that
could lead to checksum failures due to hint bit writes.

Now that we don't do acquire any locks in potentially blocking mode in
the startup process anymore, add a assertion ensuring that that stays
so in the primary.

Backpatch all the way.

Discussion: 20150120152819.GC24381@alap3.anarazel.de
---
 src/backend/commands/dbcommands.c | 71 +++++++++++++++++++++++++++++++--------
 src/backend/storage/lmgr/lock.c   |  2 ++
 2 files changed, 59 insertions(+), 14 deletions(-)

diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 5e66961..3845eb7 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1204,6 +1204,12 @@ movedb(const char *dbname, const char *tblspcname)
 	}
 
 	/*
+	 * Force xid assignment, for the benefit of dbase_redo()'s internal
+	 * locking.
+	 */
+	GetTopTransactionId();
+
+	/*
 	 * Use an ENSURE block to make sure we remove the debris if the copy fails
 	 * (eg, due to out-of-disk-space).  This is not a 100% solution, because
 	 * of the possibility of failure during transaction commit, but it should
@@ -1314,6 +1320,12 @@ movedb(const char *dbname, const char *tblspcname)
 	StartTransactionCommand();
 
 	/*
+	 * Force xid assignment, for the benefit of dbase_redo()'s internal
+	 * locking.
+	 */
+	GetTopTransactionId();
+
+	/*
 	 * Remove files from the old tablespace
 	 */
 	if (!rmtree(src_dbpath, true))
@@ -2053,6 +2065,34 @@ dbase_redo(XLogReaderState *record)
 		dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
 
 		/*
+		 * Can only do the locking if the server generating the record was new
+		 * enough to know it has to assign a xid.
+		 */
+		if (InHotStandby && TransactionIdIsValid(XLogRecGetXid(record)))
+		{
+			LOCKTAG locktag;
+
+			SET_LOCKTAG_OBJECT(locktag,
+							   InvalidOid,
+							   DatabaseRelationId,
+							   xlrec->src_db_id,
+							   0);
+
+			/* Lock source database, do avoid concurrent hint bit writes et al. */
+			StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock);
+			ResolveRecoveryConflictWithDatabase(xlrec->src_db_id);
+
+			/* Lock target database, it'll be overwritten in a second */
+			SET_LOCKTAG_OBJECT(locktag,
+							   InvalidOid,
+							   DatabaseRelationId,
+							   xlrec->db_id,
+							   0);
+			StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock);
+			ResolveRecoveryConflictWithDatabase(xlrec->src_db_id);
+		}
+
+		/*
 		 * Our theory for replaying a CREATE is to forcibly drop the target
 		 * subdirectory if present, then re-copy the source data. This may be
 		 * more work than needed, but it is simple to implement.
@@ -2086,16 +2126,31 @@ dbase_redo(XLogReaderState *record)
 
 		dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
 
-		if (InHotStandby)
+		/*
+		 * Can only do the locking if the server generating the record was new
+		 * enough to know it has to assign a xid.
+		 */
+		if (InHotStandby && TransactionIdIsValid(XLogRecGetXid(record)))
 		{
+			LOCKTAG locktag;
+
 			/*
 			 * Lock database while we resolve conflicts to ensure that
 			 * InitPostgres() cannot fully re-execute concurrently. This
 			 * avoids backends re-connecting automatically to same database,
 			 * which can happen in some cases.
 			 */
-			LockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
+			SET_LOCKTAG_OBJECT(locktag,
+							   InvalidOid,
+							   DatabaseRelationId,
+							   xlrec->db_id,
+							   0);
+			StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock);
 			ResolveRecoveryConflictWithDatabase(xlrec->db_id);
+
+			/* Lock pg_database, to conflict with base backups */
+			SET_LOCKTAG_RELATION(locktag, 0, DatabaseRelationId);
+			StandbyAcquireLock(XLogRecGetXid(record), &locktag, RowExclusiveLock);
 		}
 
 		/* Drop pages for this database that are in the shared buffer cache */
@@ -2112,18 +2167,6 @@ dbase_redo(XLogReaderState *record)
 			ereport(WARNING,
 					(errmsg("some useless files may be left behind in old database directory \"%s\"",
 							dst_path)));
-
-		if (InHotStandby)
-		{
-			/*
-			 * Release locks prior to commit. XXX There is a race condition
-			 * here that may allow backends to reconnect, but the window for
-			 * this is small because the gap between here and commit is mostly
-			 * fairly small and it is unlikely that people will be dropping
-			 * databases that we are trying to connect to anyway.
-			 */
-			UnlockSharedObjectForSession(DatabaseRelationId, xlrec->db_id, 0, AccessExclusiveLock);
-		}
 	}
 	else
 		elog(PANIC, "dbase_redo: unknown op code %u", info);
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 02ecf3d..f051ad6 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -718,6 +718,8 @@ LockAcquireExtended(const LOCKTAG *locktag,
 						lockMethodTable->lockModeNames[lockmode]),
 				 errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
 
+	Assert(!InRecovery || (sessionLock && dontWait));
+
 #ifdef LOCK_DEBUG
 	if (LOCK_DEBUG_ENABLED(locktag))
 		elog(LOG, "LockAcquire: lock [%u,%u] %s",
-- 
2.2.1.212.gc5b9256

