crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Started by Tom Laneover 15 years ago8 messages
#1Tom Lane
tgl@sss.pgh.pa.us

I managed to crash the executor in the tablespace.sql test while working
on a 9.1 patch, and discovered that the postmaster fails to recover
from that. The end of postmaster.log looks like

LOG: all server processes terminated; reinitializing
LOG: database system was interrupted; last known up at 2010-07-11 19:30:07 EDT
LOG: database system was not properly shut down; automatic recovery in progress
LOG: consistent recovery state reached at 0/EE26F30
LOG: redo starts at 0/EE26F30
FATAL: directory "/home/postgres/pgsql/src/test/regress/testtablespace/PG_9.1_201004261" already in use as a tablespace
CONTEXT: xlog redo create ts: 127158 "/home/postgres/pgsql/src/test/regress/testtablespace"
LOG: startup process (PID 13914) exited with exit code 1
LOG: aborting startup due to startup process failure

It looks to me like those well-intentioned recent changes in this area
broke the crash-recovery case. Not good.

regards, tom lane

#2Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#1)
1 attachment(s)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Tom Lane wrote:

I managed to crash the executor in the tablespace.sql test while working
on a 9.1 patch, and discovered that the postmaster fails to recover
from that. The end of postmaster.log looks like

LOG: all server processes terminated; reinitializing
LOG: database system was interrupted; last known up at 2010-07-11 19:30:07 EDT
LOG: database system was not properly shut down; automatic recovery in progress
LOG: consistent recovery state reached at 0/EE26F30
LOG: redo starts at 0/EE26F30
FATAL: directory "/home/postgres/pgsql/src/test/regress/testtablespace/PG_9.1_201004261" already in use as a tablespace
CONTEXT: xlog redo create ts: 127158 "/home/postgres/pgsql/src/test/regress/testtablespace"
LOG: startup process (PID 13914) exited with exit code 1
LOG: aborting startup due to startup process failure

It looks to me like those well-intentioned recent changes in this area
broke the crash-recovery case. Not good.

Sorry for the delay. I didn't realize this was my code that was broken
until Heikki told me via IM.

The bug is that we can't replay mkdir()/symlink() and assume those will
always succeed. I looked at the createdb redo code and it basically
drops the directory before creating it.

The tablespace directory/symlink setup is more complex, so I just wrote
the attached patch to trigger a redo-'delete' tablespace operation
before the create tablespace redo operation.

Ignoring mkdir/symlink creation failure is not an option because the
symlink might point to some wrong location or something.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ None of us is going to be here forever. +

Attachments:

/pgpatches/tablespace_replay_fixtext/x-diffDownload
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c	18 Jul 2010 04:47:46 -0000	1.77
--- src/backend/commands/tablespace.c	18 Jul 2010 05:17:23 -0000
***************
*** 1355,1368 ****
  	/* Backup blocks are not used in tblspc records */
  	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
! 	if (info == XLOG_TBLSPC_CREATE)
! 	{
! 		xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) XLogRecGetData(record);
! 		char	   *location = xlrec->ts_path;
! 
! 		create_tablespace_directories(location, xlrec->ts_id);
! 	}
! 	else if (info == XLOG_TBLSPC_DROP)
  	{
  		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
  
--- 1355,1365 ----
  	/* Backup blocks are not used in tblspc records */
  	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
  
! 	/*
! 	 *	If we are creating a tablespace during recovery, it is unclear
! 	 *	what state it is in, so potentially remove it before creating it.
! 	 */
! 	if (info == XLOG_TBLSPC_DROP || info == XLOG_TBLSPC_CREATE)
  	{
  		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);
  
***************
*** 1395,1400 ****
--- 1392,1407 ----
  	}
  	else
  		elog(PANIC, "tblspc_redo: unknown op code %u", info);
+ 
+ 	/* Now create the tablespace we perhaps just removed. */
+ 	if (info == XLOG_TBLSPC_CREATE)
+ 	{
+ 		xl_tblspc_create_rec *xlrec = (xl_tblspc_create_rec *) XLogRecGetData(record);
+ 		char	   *location = xlrec->ts_path;
+ 
+ 		create_tablespace_directories(location, xlrec->ts_id);
+ 	}
+ 
  }
  
  void
#3Heikki Linnakangas
heikki.linnakangas@enterprisedb.com
In reply to: Bruce Momjian (#2)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

On 18/07/10 08:22, Bruce Momjian wrote:

The bug is that we can't replay mkdir()/symlink() and assume those will
always succeed. I looked at the createdb redo code and it basically
drops the directory before creating it.

The tablespace directory/symlink setup is more complex, so I just wrote
the attached patch to trigger a redo-'delete' tablespace operation
before the create tablespace redo operation.

Redoing a drop talespace assumes the tablespace directory is empty,
which it necessarily isn't when redoing a create tablespace command:

postgres=# CREATE TABLESPACE t LOCATION '/tmp/t';
CREATE TABLESPACE
postgres=# CREATE TABLE tfoo (id int4) TABLESPACE t;
CREATE TABLE
postgres=# \q
$ killall -9 postmaster
$ bin/postmaster -D data
LOG: database system was interrupted; last known up at 2010-07-18
08:48:32 EEST
LOG: database system was not properly shut down; automatic recovery in
progress
LOG: consistent recovery state reached at 0/5E889C
LOG: redo starts at 0/5E889C
FATAL: tablespace 16402 is not empty
CONTEXT: xlog redo create ts: 16402 "/tmp/t"
LOG: startup process (PID 5987) exited with exit code 1
LOG: aborting startup due to startup process failure

Also, casting the xl_tblspc_create_rec as xl_tblspc_drop_rec is a bit
questionable. It works because both structs begin with the tablespace
Oid, but it doesn't look right, and can break in hard-to-notice ways in
the future if the structure of those structs change in the future.

Ignoring mkdir/symlink creation failure is not an option because the
symlink might point to some wrong location or something.

Maybe you should check that it points to the right location? Or drop and
recreate the symlink, and ignore failure at mkdir.

--
Heikki Linnakangas
EnterpriseDB http://www.enterprisedb.com

#4Tom Lane
tgl@sss.pgh.pa.us
In reply to: Heikki Linnakangas (#3)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:

Maybe you should check that it points to the right location? Or drop and
recreate the symlink, and ignore failure at mkdir.

More specifically, ignore EEXIST failure when replaying mkdir. Anything
else is still a problem.

regards, tom lane

#5Bruce Momjian
bruce@momjian.us
In reply to: Tom Lane (#4)
1 attachment(s)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Tom Lane wrote:

Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:

Maybe you should check that it points to the right location? Or drop and
recreate the symlink, and ignore failure at mkdir.

More specifically, ignore EEXIST failure when replaying mkdir. Anything
else is still a problem.

Thanks for the help. I tried to find somewhere else in our recovery
code that was similar but didn't find anything.

The attached patch does as suggested. I added the recovery code to the
create tablespace function so I didn't have to duplicate all the code
that computes the path names.

Attached.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ None of us is going to be here forever. +

Attachments:

/pgpatches/tablespace_replay_fixtext/x-diffDownload
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c	18 Jul 2010 04:47:46 -0000	1.77
--- src/backend/commands/tablespace.c	18 Jul 2010 15:53:34 -0000
***************
*** 568,578 ****
  	 */
  	if (mkdir(location_with_version_dir, S_IRWXU) < 0)
  	{
  		if (errno == EEXIST)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_OBJECT_IN_USE),
! 					 errmsg("directory \"%s\" already in use as a tablespace",
! 							location_with_version_dir)));
  		else
  			ereport(ERROR,
  					(errcode_for_file_access(),
--- 568,582 ----
  	 */
  	if (mkdir(location_with_version_dir, S_IRWXU) < 0)
  	{
+ 		/* In recovery, directory might already exists, which is OK */
  		if (errno == EEXIST)
! 		{
! 			if (!InRecovery)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_OBJECT_IN_USE),
! 						 errmsg("directory \"%s\" already in use as a tablespace",
! 								location_with_version_dir)));
! 		}
  		else
  			ereport(ERROR,
  					(errcode_for_file_access(),
***************
*** 580,585 ****
--- 584,599 ----
  							location_with_version_dir)));
  	}
  
+ 	/* Remove old symlink in recovery, in case it points to the wrong place */
+ 	if (InRecovery)
+ 	{
+ 		if (unlink(linkloc) < 0 && errno != ENOENT)
+ 			ereport(ERROR,
+ 					(errcode_for_file_access(),
+ 					 errmsg("could not remove symbolic link \"%s\": %m",
+ 							linkloc)));
+ 	}
+ 	
  	/*
  	 * Create the symlink under PGDATA
  	 */
#6Bruce Momjian
bruce@momjian.us
In reply to: Bruce Momjian (#5)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Bruce Momjian wrote:

Tom Lane wrote:

Heikki Linnakangas <heikki.linnakangas@enterprisedb.com> writes:

Maybe you should check that it points to the right location? Or drop and
recreate the symlink, and ignore failure at mkdir.

More specifically, ignore EEXIST failure when replaying mkdir. Anything
else is still a problem.

Thanks for the help. I tried to find somewhere else in our recovery
code that was similar but didn't find anything.

The attached patch does as suggested. I added the recovery code to the
create tablespace function so I didn't have to duplicate all the code
that computes the path names.

Attached.

Uh, another question. Looking at the createdb recovery, I see:

/*
* Our theory for replaying a CREATE is to forcibly drop the target
* subdirectory if present, then re-copy the source data. This may be
* more work than needed, but it is simple to implement.
*/
if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
{
if (!rmtree(dst_path, true))
ereport(WARNING,
(errmsg("some useless files may be left behind in old database directory \"%s\"",
dst_path)));
}

Should I be using rmtree() on the mkdir target?

Also, the original tablespace recovery code did not drop the symlink
first. I assume that was not a bug only because we don't support moving
tablespaces:

- /* Create the symlink if not already present */
- linkloc = (char *) palloc(OIDCHARS + OIDCHARS + 1);
- sprintf(linkloc, "pg_tblspc/%u", xlrec->ts_id);
-
- if (symlink(location, linkloc) < 0)
- {
- if (errno != EEXIST)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create symbolic link \"%s\": %m",
- linkloc)));
- }

Still, it seems logical to unlink it before creating it.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ None of us is going to be here forever. +

#7Bruce Momjian
bruce@momjian.us
In reply to: Bruce Momjian (#6)
1 attachment(s)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Bruce Momjian wrote:

The attached patch does as suggested. I added the recovery code to the
create tablespace function so I didn't have to duplicate all the code
that computes the path names.

Attached.

Uh, another question. Looking at the createdb recovery, I see:

/*
* Our theory for replaying a CREATE is to forcibly drop the target
* subdirectory if present, then re-copy the source data. This may be
* more work than needed, but it is simple to implement.
*/
if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
{
if (!rmtree(dst_path, true))
ereport(WARNING,
(errmsg("some useless files may be left behind in old database directory \"%s\"",
dst_path)));
}

Should I be using rmtree() on the mkdir target?

Also, the original tablespace recovery code did not drop the symlink
first. I assume that was not a bug only because we don't support moving
tablespaces:

For consistency with CREATE DATABASE recovery and for reliablity, I
coded the rmtree() call instead. Patch attached.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ None of us is going to be here forever. +

Attachments:

/pgpatches/tablespace_replay_fixtext/x-diffDownload
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c	18 Jul 2010 04:47:46 -0000	1.77
--- src/backend/commands/tablespace.c	19 Jul 2010 04:59:03 -0000
***************
*** 538,543 ****
--- 538,544 ----
  	char	   *linkloc = palloc(OIDCHARS + OIDCHARS + 1);
  	char	   *location_with_version_dir = palloc(strlen(location) + 1 +
  								   strlen(TABLESPACE_VERSION_DIRECTORY) + 1);
+ 	struct stat st;
  
  	sprintf(linkloc, "pg_tblspc/%u", tablespaceoid);
  	sprintf(location_with_version_dir, "%s/%s", location,
***************
*** 562,567 ****
--- 563,584 ----
  						 location)));
  	}
  
+ 	if (InRecovery)
+ 	{
+ 		/*
+ 		 * Our theory for replaying a CREATE is to forcibly drop the target
+ 		 * subdirectory if present, and then recreate it. This may be
+ 		 * more work than needed, but it is simple to implement.
+ 		 */
+ 		if (stat(location_with_version_dir, &st) == 0 && S_ISDIR(st.st_mode))
+ 		{
+ 			if (!rmtree(location_with_version_dir, true))
+ 				ereport(WARNING,
+ 						(errmsg("some useless files may be left behind in old database directory \"%s\"",
+ 								location_with_version_dir)));
+ 		}
+ 	}
+ 
  	/*
  	 * The creation of the version directory prevents more than one tablespace
  	 * in a single location.
***************
*** 580,585 ****
--- 597,612 ----
  							location_with_version_dir)));
  	}
  
+ 	/* Remove old symlink in recovery, in case it points to the wrong place */
+ 	if (InRecovery)
+ 	{
+ 		if (unlink(linkloc) < 0 && errno != ENOENT)
+ 			ereport(ERROR,
+ 					(errcode_for_file_access(),
+ 					 errmsg("could not remove symbolic link \"%s\": %m",
+ 							linkloc)));
+ 	}
+ 	
  	/*
  	 * Create the symlink under PGDATA
  	 */
#8Bruce Momjian
bruce@momjian.us
In reply to: Bruce Momjian (#7)
1 attachment(s)
Re: crash-recovery replay of CREATE TABLESPACE is broken in HEAD

Bruce Momjian wrote:

Bruce Momjian wrote:

The attached patch does as suggested. I added the recovery code to the
create tablespace function so I didn't have to duplicate all the code
that computes the path names.

Attached.

Uh, another question. Looking at the createdb recovery, I see:

/*
* Our theory for replaying a CREATE is to forcibly drop the target
* subdirectory if present, then re-copy the source data. This may be
* more work than needed, but it is simple to implement.
*/
if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
{
if (!rmtree(dst_path, true))
ereport(WARNING,
(errmsg("some useless files may be left behind in old database directory \"%s\"",
dst_path)));
}

Should I be using rmtree() on the mkdir target?

Also, the original tablespace recovery code did not drop the symlink
first. I assume that was not a bug only because we don't support moving
tablespaces:

For consistency with CREATE DATABASE recovery and for reliablity, I
coded the rmtree() call instead. Patch attached.

Attached patch applied to HEAD and 9.0. 9.0 open item moved to
completed.

--
Bruce Momjian <bruce@momjian.us> http://momjian.us
EnterpriseDB http://enterprisedb.com

+ None of us is going to be here forever. +

Attachments:

/rtmp/difftext/x-diffDownload
Index: src/backend/commands/dbcommands.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/dbcommands.c,v
retrieving revision 1.235
diff -c -c -r1.235 dbcommands.c
*** src/backend/commands/dbcommands.c	26 Feb 2010 02:00:38 -0000	1.235
--- src/backend/commands/dbcommands.c	20 Jul 2010 18:11:06 -0000
***************
*** 1908,1913 ****
--- 1908,1914 ----
  		if (stat(dst_path, &st) == 0 && S_ISDIR(st.st_mode))
  		{
  			if (!rmtree(dst_path, true))
+ 				/* If this failed, copydir() below is going to error. */
  				ereport(WARNING,
  						(errmsg("some useless files may be left behind in old database directory \"%s\"",
  								dst_path)));
Index: src/backend/commands/tablespace.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/commands/tablespace.c,v
retrieving revision 1.77
diff -c -c -r1.77 tablespace.c
*** src/backend/commands/tablespace.c	18 Jul 2010 04:47:46 -0000	1.77
--- src/backend/commands/tablespace.c	20 Jul 2010 18:11:07 -0000
***************
*** 562,567 ****
--- 562,586 ----
  						 location)));
  	}
  
+ 	if (InRecovery)
+ 	{
+ 		struct stat st;
+ 
+ 		/*
+ 		 * Our theory for replaying a CREATE is to forcibly drop the target
+ 		 * subdirectory if present, and then recreate it. This may be
+ 		 * more work than needed, but it is simple to implement.
+ 		 */
+ 		if (stat(location_with_version_dir, &st) == 0 && S_ISDIR(st.st_mode))
+ 		{
+ 			if (!rmtree(location_with_version_dir, true))
+ 				/* If this failed, mkdir() below is going to error. */
+ 				ereport(WARNING,
+ 						(errmsg("some useless files may be left behind in old database directory \"%s\"",
+ 								location_with_version_dir)));
+ 		}
+ 	}
+ 
  	/*
  	 * The creation of the version directory prevents more than one tablespace
  	 * in a single location.
***************
*** 580,585 ****
--- 599,614 ----
  							location_with_version_dir)));
  	}
  
+ 	/* Remove old symlink in recovery, in case it points to the wrong place */
+ 	if (InRecovery)
+ 	{
+ 		if (unlink(linkloc) < 0 && errno != ENOENT)
+ 			ereport(ERROR,
+ 					(errcode_for_file_access(),
+ 					 errmsg("could not remove symbolic link \"%s\": %m",
+ 							linkloc)));
+ 	}
+ 	
  	/*
  	 * Create the symlink under PGDATA
  	 */