>From 891e527a6e48a69334eb5960a7a7add0625353d1 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Fri, 30 Jan 2015 09:35:41 +0100
Subject: [PATCH 4/4] Don't allow changing a database's tablespace during a
 basebackup.

A basebackup that's being run while a ALTER DATABASE ... SET
TABLESPACE is occuring can end up being corrupted because moving a
database doesn't use normal WAL mechanisms. It can happen that the
basebackup tries to backup a tablespace/database directory while
movedb() is removing the directory. That can lead to inconsistent old
and new database directories.  Normally such inconsistencies are fixed
by WAL replay, but the WAL doesn't contain the data of the database
that's moved and instead just recopies the database directories.

It would be nicer to fix this problem by making such database use
proper WAL logging, but that's not going to be backpatchable. So we
prevent basebackups from occurring while ALTER DATABASE ... SET
TABLESPACE is happening and the other way round.

We do this in one direction by acquiring a lock on pg_database when
starting a base backup, for the entirety of a streaming base backup
and when replaying database events.  For the case when ALTER DATABASE
... SET TABLESPACE is called when a exclusive (i.e. not a replication
protocol one) is in progress a error is raised. That's required
because we can't hold a lock between the pg_start_backup() and
pg_stop_backup() calls since they can be made in independent sessions.

That approach means that external code calling
do_pg_start_backup(not_exclusive) aren't protected, but there doesn't
seem to be a realistic way to protect those. They'll have to do the
locking themselves.

Discussion: 20150120152819.GC24381@alap3.anarazel.de

Backpatch all the way.
---
 src/backend/access/transam/xlog.c    | 29 +++++++++++++++++++++++++++++
 src/backend/commands/dbcommands.c    | 33 +++++++++++++++++++++++++++++++++
 src/backend/replication/basebackup.c | 15 +++++++++++++++
 src/backend/storage/lmgr/lmgr.c      | 31 +++++++++++++++++++++++++++++++
 src/backend/utils/init/postinit.c    |  2 +-
 src/include/access/xlog.h            |  1 +
 src/include/storage/lmgr.h           |  3 +++
 7 files changed, 113 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 629a457..38e7dff 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -53,6 +53,7 @@
 #include "storage/ipc.h"
 #include "storage/large_object.h"
 #include "storage/latch.h"
+#include "storage/lmgr.h"
 #include "storage/pmsignal.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -9291,6 +9292,12 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("backup label too long (max %d bytes)",
 						MAXPGPATH)));
+	/*
+	 * Acquire lock on pg datababase to prevent concurrent CREATE DATABASE and
+	 * ALTER DATABASE ... SET TABLESPACE from running. In case of an error
+	 * it'll be released by the transaction abort code.
+	 */
+	LockRelationOidForSession(DatabaseRelationId, ShareLock);
 
 	/*
 	 * Mark backup active in shared memory.  We must do full-page WAL writes
@@ -9523,6 +9530,8 @@ do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
 
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
+
 	/*
 	 * We're done.  As a convenience, return the starting WAL location.
 	 */
@@ -9937,6 +9946,26 @@ do_pg_abort_backup(void)
 }
 
 /*
+ * Is a (exclusive or nonexclusive) base backup running?
+ *
+ * Note that this does not check whether any standby of this node has a
+ * basebackup running, or whether any upstream master (if this is a standby)
+ * has one in progress
+ */
+bool
+LocalBaseBackupInProgress(void)
+{
+	bool ret;
+
+	WALInsertLockAcquire();
+	ret = XLogCtl->Insert.exclusiveBackup ||
+		XLogCtl->Insert.nonExclusiveBackups > 0;
+	WALInsertLockRelease();
+
+	return ret;
+}
+
+/*
  * Get latest redo apply position.
  *
  * Exported to allow WALReceiver to read the pointer directly.
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 3845eb7..e6a3352 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1071,6 +1071,9 @@ movedb(const char *dbname, const char *tblspcname)
 	LockSharedObjectForSession(DatabaseRelationId, db_id, 0,
 							   AccessExclusiveLock);
 
+	/* Acquire lock on pg_database against concurrent base backups starting */
+	LockRelationOidForSession(DatabaseRelationId, RowExclusiveLock);
+
 	/*
 	 * Permission checks
 	 */
@@ -1087,6 +1090,30 @@ movedb(const char *dbname, const char *tblspcname)
 				 errmsg("cannot change the tablespace of the currently open database")));
 
 	/*
+	 * Prevent SET TABLESPACE from running concurrently with a base
+	 * backup. Without that check a base backup would potentially copy a
+	 * partially removed source database; which WAL replay then would copy
+	 * over the new database...
+	 *
+	 * Starting a base backup takes a SHARE lock on pg_database. In addition a
+	 * streaming basebackup takes the same lock for the entirety of the copy
+	 * of the data directory.  That, combined with this check, prevents base
+	 * backups from being taken at the same time a SET TABLESPACE is in
+	 * progress.
+	 *
+	 * Note that this check here will not trigger if a standby currently has a
+	 * base backup ongoing; instead WAL replay will have a recovery conflict
+	 * when replaying the DBASE_CREATE record. That is only safe because
+	 * standbys can only do nonexclusive base backups which hold the lock over
+	 * the entire runtime - otherwise no lock would prevent replay after
+	 * pg_start_backup().
+	 */
+	if (LocalBaseBackupInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot change a database's tablespace while a base backup is in progress")));
+
+	/*
 	 * Get tablespace's oid
 	 */
 	dst_tblspcoid = get_tablespace_oid(tblspcname, false);
@@ -1116,6 +1143,7 @@ movedb(const char *dbname, const char *tblspcname)
 		heap_close(pgdbrel, NoLock);
 		UnlockSharedObjectForSession(DatabaseRelationId, db_id, 0,
 									 AccessExclusiveLock);
+		UnlockRelationOidForSession(DatabaseRelationId, RowExclusiveLock);
 		return;
 	}
 
@@ -1352,6 +1380,7 @@ movedb(const char *dbname, const char *tblspcname)
 	/* Now it's safe to release the database lock */
 	UnlockSharedObjectForSession(DatabaseRelationId, db_id, 0,
 								 AccessExclusiveLock);
+	UnlockRelationOidForSession(DatabaseRelationId, RowExclusiveLock);
 }
 
 /* Error cleanup callback for movedb */
@@ -2090,6 +2119,10 @@ dbase_redo(XLogReaderState *record)
 							   0);
 			StandbyAcquireLock(XLogRecGetXid(record), &locktag, AccessExclusiveLock);
 			ResolveRecoveryConflictWithDatabase(xlrec->src_db_id);
+
+			/* Lock pg_database, to conflict with base backups */
+			SET_LOCKTAG_RELATION(locktag, 0, DatabaseRelationId);
+			StandbyAcquireLock(XLogRecGetXid(record), &locktag, RowExclusiveLock);
 		}
 
 		/*
diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 3058ce9..a0b590f 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -17,8 +17,10 @@
 #include <unistd.h>
 #include <time.h>
 
+#include "access/xact.h"
 #include "access/xlog_internal.h"		/* for pg_start/stop_backup */
 #include "catalog/catalog.h"
+#include "catalog/pg_database.h"
 #include "catalog/pg_type.h"
 #include "lib/stringinfo.h"
 #include "libpq/libpq.h"
@@ -32,6 +34,8 @@
 #include "replication/walsender_private.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/lock.h"
 #include "utils/builtins.h"
 #include "utils/elog.h"
 #include "utils/ps_status.h"
@@ -134,6 +138,9 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 
 	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
 								  &labelfile);
+
+	LockRelationOidForSession(DatabaseRelationId, ShareLock);
+
 	/*
 	 * Once do_pg_start_backup has been called, ensure that any failure causes
 	 * us to abort the backup so we don't "leak" a backup counter. For this reason,
@@ -304,6 +311,9 @@ perform_base_backup(basebackup_options *opt, DIR *tblspcdir)
 	}
 	PG_END_ENSURE_ERROR_CLEANUP(base_backup_cleanup, (Datum) 0);
 
+	/* release lock preventing base backups - no risk while backing up WAL */
+	UnlockRelationOidForSession(DatabaseRelationId, ShareLock);
+
 	endptr = do_pg_stop_backup(labelfile, !opt->nowait, &endtli);
 
 	if (opt->includewal)
@@ -675,6 +685,9 @@ SendBaseBackup(BaseBackupCmd *cmd)
 		set_ps_display(activitymsg, false);
 	}
 
+	/* to provide error handling around locks */
+	StartTransactionCommand();
+
 	/* Make sure we can open the directory with tablespaces in it */
 	dir = AllocateDir("pg_tblspc");
 	if (!dir)
@@ -684,6 +697,8 @@ SendBaseBackup(BaseBackupCmd *cmd)
 	perform_base_backup(&opt, dir);
 
 	FreeDir(dir);
+
+	CommitTransactionCommand();
 }
 
 static void
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index d13a167..c428b38 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -248,6 +248,37 @@ UnlockRelation(Relation relation, LOCKMODE lockmode)
 }
 
 /*
+ *		LockRelationOidForSession
+ *
+ * Lock a relation in session mode, without requiring having it opened it in
+ * transaction local mode. That's likely only useful during WAL replay.
+ */
+void
+LockRelationOidForSession(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	SetLocktagRelationOid(&tag, relid);
+
+	(void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ *		UnlockRelationOidForSession
+ *
+ * Unlock lock acquired by LockRelationOidForSession.
+ */
+void
+UnlockRelationOidForSession(Oid relid, LOCKMODE lockmode)
+{
+	LOCKTAG		tag;
+
+	SetLocktagRelationOid(&tag, relid);
+
+	LockRelease(&tag, lockmode, true);
+}
+
+/*
  *		LockHasWaitersRelation
  *
  * This is a functiion to check if someone else is waiting on a
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 1f5cf06..dfe2322 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -886,7 +886,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username,
 			ereport(FATAL,
 					(errcode(ERRCODE_UNDEFINED_DATABASE),
 					 errmsg("database \"%s\" does not exist", dbname),
-			   errdetail("It seems to have just been dropped or renamed.")));
+			   errdetail("It seems to have just been dropped, moved or renamed.")));
 	}
 
 	/*
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 138deaf..f3893f3 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -253,6 +253,7 @@ extern XLogRecPtr do_pg_start_backup(const char *backupidstr, bool fast,
 extern XLogRecPtr do_pg_stop_backup(char *labelfile, bool waitforarchive,
 				  TimeLineID *stoptli_p);
 extern void do_pg_abort_backup(void);
+extern bool LocalBaseBackupInProgress(void);
 
 /* File path names (all relative to $PGDATA) */
 #define BACKUP_LABEL_FILE		"backup_label"
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index f5d70e5..2397f11 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -50,6 +50,9 @@ extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode);
 extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
 extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode);
 
+extern void LockRelationOidForSession(Oid relid, LOCKMODE lockmode);
+extern void UnlockRelationOidForSession(Oid relid, LOCKMODE lockmode);
+
 /* Lock a relation for extension */
 extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode);
 extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode);
-- 
2.2.1.212.gc5b9256

